Rename to nancy
This commit is contained in:
parent
6858489b0b
commit
007c22d3e2
64
nancy/__init__.py
Normal file
64
nancy/__init__.py
Normal file
@ -0,0 +1,64 @@
|
|||||||
|
import sqlite3
|
||||||
|
|
||||||
|
from . import db
|
||||||
|
|
||||||
|
__version__ = "0.0.1"
|
||||||
|
|
||||||
|
|
||||||
|
# Calling code will build up a large DAG then at the very last step, call
|
||||||
|
# nancy.save_data(). Only at that point will we determine where to save the
|
||||||
|
# data by asking the user for an output dir. However, we need a database
|
||||||
|
# initialized in order to build up the DAG in the first place, so here we
|
||||||
|
# initialize an in-memory database to use until we have an output directory.
|
||||||
|
_conn = db.init(":memory:")
|
||||||
|
|
||||||
|
|
||||||
|
def save_data(
    destination_mapping,
    desc=f"Provenance-tracking with nancy v{__version__}",
):
    """
    A command-line interface to compute and save a set of outputs.

    Parses ``--out_dir``/``-o`` from the command line, creates that directory
    (refusing to reuse a path that already exists), copies the in-memory
    provenance database to ``<out_dir>/prov.db`` and rebinds the module-level
    ``_conn`` to that on-disk database.

    Args:
        destination_mapping: Mapping from paths relative to the output
            directory to the objects to be saved there (see example below).
            NOTE(review): not consumed yet; the saving step at the end of
            this function is still only a placeholder comment.
        desc: Description shown in the command-line ``--help`` output.

    Ex:

    .. code::

        save_data({
            "scores.csv": scores,
            "checkpoints": checkpoint_dir,
            "plots/learning_curve.pdf": learning_curve_plot,
            "plots/test_roc.pdf": test_auc_plot,
        })
    """
    global _conn

    import argparse
    import os
    import sys

    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument(
        "--out_dir",
        "-o",
        required=True,
        help="Directory (will be created) in which to save all outputs",
    )
    args = parser.parse_args()

    # Create the output directory. Creating it directly, instead of checking
    # for existence first, means a directory that appears between the check
    # and the creation can never be silently reused.
    try:
        os.makedirs(args.out_dir)
    except FileExistsError:
        print(
            f"Output directory {args.out_dir} exists. Refusing to overwrite.",
            file=sys.stderr,
        )
        sys.exit(1)

    # Back up the in-memory db to this location, and switch to it.
    dst_conn = sqlite3.connect(os.path.join(args.out_dir, "prov.db"))
    try:
        _conn.backup(dst_conn)
    except Exception:
        # Do not leak the half-initialized on-disk connection; keep using
        # the in-memory database, which is still intact.
        dst_conn.close()
        raise
    _conn.close()
    _conn = dst_conn

    # Now we save the provided roots, and record their locations in _conn.data
|
||||||
21
nancy/db.py
Normal file
21
nancy/db.py
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
import importlib.resources
|
||||||
|
import os
|
||||||
|
import sqlite3
|
||||||
|
|
||||||
|
|
||||||
|
def init(path):
    """
    Create a new database at ``path`` and load the nancy schema into it.

    Args:
        path: Location of the SQLite database to create, or ``":memory:"``
            for an in-memory database.

    Returns:
        An open ``sqlite3.Connection`` on which ``schema.sql`` has been run.

    Raises:
        FileExistsError: If ``path`` names something that already exists on
            disk.
    """
    # ":memory:" is SQLite's name for an in-memory database and never refers
    # to a file, so a stray file with that name must not block initialization.
    if path != ":memory:" and os.path.exists(path):
        raise FileExistsError(f"Refusing to overwrite existing database {path}")
    # Read the packaged schema inside a context manager so the resource
    # handle is closed promptly.
    with importlib.resources.open_text("nancy", "schema.sql") as schema_file:
        schema = schema_file.read()
    conn = sqlite3.connect(path)
    try:
        conn.cursor().executescript(schema)
    except Exception:
        # Do not hand back (or leak) a connection with a partial schema.
        conn.close()
        raise
    return conn
|
||||||
|
|
||||||
|
|
||||||
|
def connect(path):
    """
    Open the SQLite database at ``path`` with foreign keys switched on.

    Returns the open ``sqlite3.Connection``.
    """
    connection = sqlite3.connect(path)
    # The pragma is issued on this connection right after opening it.
    pragma_cursor = connection.cursor()
    pragma_cursor.execute("PRAGMA foreign_keys = ON;")
    return connection
|
||||||
@ -1,4 +1,4 @@
|
|||||||
-- Schema file for dprov.db
|
-- Schema file for nancy.db
|
||||||
PRAGMA user_version=0; -- version of this schema.
|
PRAGMA user_version=0; -- version of this schema.
|
||||||
-- ^^^^^ Increment the above whenever meaningful updates are made to this file.
|
-- ^^^^^ Increment the above whenever meaningful updates are made to this file.
|
||||||
|
|
||||||
@ -6,7 +6,7 @@ PRAGMA foreign_keys=ON;
|
|||||||
|
|
||||||
-- computational environment which can execute "runs"
|
-- computational environment which can execute "runs"
|
||||||
CREATE TABLE environment (id INTEGER PRIMARY KEY NOT NULL,
|
CREATE TABLE environment (id INTEGER PRIMARY KEY NOT NULL,
|
||||||
hostname TEXT, -- platform.node(): 'lucky'
|
hostname TEXT, -- platform.node(): computer's network name
|
||||||
envvars_json TEXT, -- json.dumps(dict(os.environ))
|
envvars_json TEXT, -- json.dumps(dict(os.environ))
|
||||||
python_implementation TEXT, -- platform.python_implementation(): 'cpython'
|
python_implementation TEXT, -- platform.python_implementation(): 'cpython'
|
||||||
python_strversion TEXT, -- sys.version: '3.9.7 (default, Sep 16 2021, 13:09:58) \n[GCC 7.5.0]'
|
python_strversion TEXT, -- sys.version: '3.9.7 (default, Sep 16 2021, 13:09:58) \n[GCC 7.5.0]'
|
||||||
@ -16,7 +16,7 @@ CREATE TABLE environment (id INTEGER PRIMARY KEY NOT NULL,
|
|||||||
|
|
||||||
-- A "process" is an execution of a single graph, starting from a number of roots
|
-- A "process" is an execution of a single graph, starting from a number of roots
|
||||||
-- It corresponds to a single host process. Note that if multiple calls to
|
-- It corresponds to a single host process. Note that if multiple calls to
|
||||||
-- dprov.run() are provided within a single python script, multiple "processes"
|
-- nancy.run() are provided within a single python script, multiple "processes"
|
||||||
-- may be created. Also note that historical processes are automatically
|
-- may be created. Also note that historical processes are automatically
|
||||||
-- imported and merged when possible when loading "data" from disk.
|
-- imported and merged when possible when loading "data" from disk.
|
||||||
CREATE TABLE process (id INTEGER PRIMARY KEY NOT NULL,
|
CREATE TABLE process (id INTEGER PRIMARY KEY NOT NULL,
|
||||||
@ -26,7 +26,7 @@ CREATE TABLE process (id INTEGER PRIMARY KEY NOT NULL,
|
|||||||
end_time REAL,
|
end_time REAL,
|
||||||
process_id INTEGER, -- host PID of python process on host OS
|
process_id INTEGER, -- host PID of python process on host OS
|
||||||
environment INTEGER NOT NULL,
|
environment INTEGER NOT NULL,
|
||||||
FOREIGN KEY (environment) REFERENCES environment (id) ON DELETE RESTRICT
|
FOREIGN KEY (environment) REFERENCES environment (id)
|
||||||
);
|
);
|
||||||
-- We try to track all python packages that impact execution by traversing a
|
-- We try to track all python packages that impact execution by traversing a
|
||||||
-- copy of sys.modules. This is done once before a "process" and once after in
|
-- copy of sys.modules. This is done once before a "process" and once after in
|
||||||
@ -35,7 +35,7 @@ CREATE TABLE py_package (id INTEGER PRIMARY KEY NOT NULL,
|
|||||||
name TEXT NOT NULL,
|
name TEXT NOT NULL,
|
||||||
version TEXT,
|
version TEXT,
|
||||||
process INTEGER NOT NULL,
|
process INTEGER NOT NULL,
|
||||||
FOREIGN KEY (process) REFERENCES process (id) ON DELETE RESTRICT
|
FOREIGN KEY (process) REFERENCES process (id)
|
||||||
);
|
);
|
||||||
-- A py_module describes any python module file containing decorated Functions.
|
-- A py_module describes any python module file containing decorated Functions.
|
||||||
-- Modules are tracked since they impact the global scope of function calls.
|
-- Modules are tracked since they impact the global scope of function calls.
|
||||||
@ -43,7 +43,7 @@ CREATE TABLE py_module(id INTEGER PRIMARY KEY NOT NULL,
|
|||||||
name TEXT NOT NULL,
|
name TEXT NOT NULL,
|
||||||
code TEXT,
|
code TEXT,
|
||||||
package INTEGER,
|
package INTEGER,
|
||||||
FOREIGN KEY (package) REFERENCES py_package (id) ON DELETE RESTRICT
|
FOREIGN KEY (package) REFERENCES py_package (id)
|
||||||
);
|
);
|
||||||
-- A "func" just describes a function, without reference to its arguments. It
|
-- A "func" just describes a function, without reference to its arguments. It
|
||||||
-- can have inputs and outputs, which are described in the func_inputs and
|
-- can have inputs and outputs, which are described in the func_inputs and
|
||||||
@ -55,20 +55,30 @@ CREATE TABLE func(id INTEGER PRIMARY KEY NOT NULL,
|
|||||||
-- e.g. datetime.datetime.now().timestamp()
|
-- e.g. datetime.datetime.now().timestamp()
|
||||||
start_time REAL,
|
start_time REAL,
|
||||||
end_time REAL,
|
end_time REAL,
|
||||||
FOREIGN KEY (module) REFERENCES py_module (id) ON DELETE RESTRICT
|
FOREIGN KEY (module) REFERENCES py_module (id)
|
||||||
);
|
);
|
||||||
CREATE TABLE func_input(id INTEGER PRIMARY KEY NOT NULL,
|
CREATE TABLE func_input(id INTEGER PRIMARY KEY NOT NULL,
|
||||||
name TEXT NOT NULL,
|
name TEXT NOT NULL,
|
||||||
typename TEXT NOT NULL,
|
typename TEXT NOT NULL,
|
||||||
func INTEGER NOT NULL,
|
func INTEGER NOT NULL,
|
||||||
FOREIGN KEY (func) REFERENCES func (id) ON DELETE RESTRICT
|
|
||||||
|
position INTEGER,
|
||||||
|
posonly BOOL,
|
||||||
|
kwonly BOOL,
|
||||||
|
|
||||||
|
description TEXT,
|
||||||
|
|
||||||
|
FOREIGN KEY (func) REFERENCES func (id)
|
||||||
);
|
);
|
||||||
CREATE TABLE func_output(id INTEGER PRIMARY KEY NOT NULL,
|
CREATE TABLE func_output(id INTEGER PRIMARY KEY NOT NULL,
|
||||||
name TEXT,
|
name TEXT,
|
||||||
position INTEGER NOT NULL,
|
|
||||||
typename TEXT NOT NULL,
|
typename TEXT NOT NULL,
|
||||||
func INTEGER NOT NULL,
|
func INTEGER NOT NULL,
|
||||||
FOREIGN KEY (func) REFERENCES func (id) ON DELETE RESTRICT
|
|
||||||
|
position INTEGER NOT NULL,
|
||||||
|
description TEXT,
|
||||||
|
|
||||||
|
FOREIGN KEY (func) REFERENCES func (id)
|
||||||
);
|
);
|
||||||
|
|
||||||
|
|
||||||
@ -79,12 +89,12 @@ CREATE TABLE func_output(id INTEGER PRIMARY KEY NOT NULL,
|
|||||||
CREATE TABLE task(id INTEGER PRIMARY KEY NOT NULL,
|
CREATE TABLE task(id INTEGER PRIMARY KEY NOT NULL,
|
||||||
func INTEGER NOT NULL,
|
func INTEGER NOT NULL,
|
||||||
process INTEGER NOT NULL,
|
process INTEGER NOT NULL,
|
||||||
FOREIGN KEY (func) REFERENCES func (id) ON DELETE RESTRICT,
|
FOREIGN KEY (func) REFERENCES func (id),
|
||||||
FOREIGN KEY (process) REFERENCES process (id) ON DELETE RESTRICT
|
FOREIGN KEY (process) REFERENCES process (id)
|
||||||
);
|
);
|
||||||
-- A datum is an object that is computed as the output of a task, given as a
|
-- A datum is an object that is computed as the output of a task, given as a
|
||||||
-- literal value in a config file, or loaded from a file.
|
-- literal value in a config file, or loaded from a file.
|
||||||
CREATE TABLE datum(id INTEGER PRIMARY KEY NOT NULL,
|
CREATE TABLE data(id INTEGER PRIMARY KEY NOT NULL,
|
||||||
-- provider_type describes where the data came from. Choices are:
|
-- provider_type describes where the data came from. Choices are:
|
||||||
-- - COMPUTED: output of a decorated Function
|
-- - COMPUTED: output of a decorated Function
|
||||||
-- - IMPORTED: imported output from a prior process
|
-- - IMPORTED: imported output from a prior process
|
||||||
@ -100,7 +110,8 @@ CREATE TABLE datum(id INTEGER PRIMARY KEY NOT NULL,
|
|||||||
|
|
||||||
typename TEXT NOT NULL, -- string representation of the data type
|
typename TEXT NOT NULL, -- string representation of the data type
|
||||||
|
|
||||||
path TEXT, -- if residing on disk, the path relative to this db file
|
saved_path TEXT, -- if residing on disk, the path relative to this db file
|
||||||
|
saved_version TEXT, -- if residing on disk, the version of the data that was saved (TODO confirm)
|
||||||
|
|
||||||
-- hash of provenance summarizes all dependencies, independent of environment
|
-- hash of provenance summarizes all dependencies, independent of environment
|
||||||
-- computed _before_ computation
|
-- computed _before_ computation
|
||||||
@ -112,8 +123,8 @@ CREATE TABLE datum(id INTEGER PRIMARY KEY NOT NULL,
|
|||||||
-- incremented
|
-- incremented
|
||||||
current_version INTEGER NOT NULL,
|
current_version INTEGER NOT NULL,
|
||||||
|
|
||||||
FOREIGN KEY (task) REFERENCES task (id) ON DELETE RESTRICT,
|
FOREIGN KEY (task) REFERENCES task (id),
|
||||||
FOREIGN KEY (task_output) REFERENCES func_output (id) ON DELETE RESTRICT
|
FOREIGN KEY (task_output) REFERENCES func_output (id)
|
||||||
);
|
);
|
||||||
-- A task input records the version of a Datum that is passed to the function
|
-- A task input records the version of a Datum that is passed to the function
|
||||||
CREATE TABLE task_input(id INTEGER PRIMARY KEY NOT NULL,
|
CREATE TABLE task_input(id INTEGER PRIMARY KEY NOT NULL,
|
||||||
@ -125,7 +136,7 @@ CREATE TABLE task_input(id INTEGER PRIMARY KEY NOT NULL,
|
|||||||
-- is passed to a non-const operation, a new datum is provided
|
-- is passed to a non-const operation, a new datum is provided
|
||||||
datum_version INTEGER NOT NULL,
|
datum_version INTEGER NOT NULL,
|
||||||
|
|
||||||
FOREIGN KEY (task) REFERENCES task (id) ON DELETE RESTRICT,
|
FOREIGN KEY (task) REFERENCES task (id),
|
||||||
FOREIGN KEY (func_input) REFERENCES func_input (id) ON DELETE RESTRICT,
|
FOREIGN KEY (func_input) REFERENCES func_input (id),
|
||||||
FOREIGN KEY (datum) REFERENCES datum (id) ON DELETE RESTRICT
|
FOREIGN KEY (datum) REFERENCES datum (id)
|
||||||
);
|
);
|
||||||
16
setup.cfg
16
setup.cfg
@ -1,21 +1,21 @@
|
|||||||
[metadata]
|
[metadata]
|
||||||
name = DProv
|
name = nancy
|
||||||
author = Jacob Hinkle
|
author = Jacob Hinkle
|
||||||
author_email = hinklejd@ornl.gov
|
author_email = hinklejd@ornl.gov
|
||||||
url = https://code.ornl.gov/4jh/dprov
|
url = https://code.ornl.gov/4jh/nancy
|
||||||
keywords = data-science workflows reproducibility provenance
|
keywords = data-science workflows reproducibility provenance
|
||||||
version = attr: setuptools_scm.get_version
|
version = attr: setuptools_scm.get_version
|
||||||
|
|
||||||
[tool:pytest]
|
[tool:pytest]
|
||||||
testpaths = tests
|
testpaths = tests
|
||||||
addopts = --verbose --cov dprov/
|
addopts = --verbose --cov nancy/
|
||||||
|
|
||||||
[aliases]
|
[aliases]
|
||||||
test=pytest
|
test=pytest
|
||||||
|
|
||||||
[options]
|
[options]
|
||||||
packages =
|
packages =
|
||||||
superfun
|
nancy
|
||||||
install_requires =
|
install_requires =
|
||||||
dill
|
dill
|
||||||
setup_requires =
|
setup_requires =
|
||||||
@ -25,14 +25,18 @@ tests_require =
|
|||||||
pytest
|
pytest
|
||||||
pytest-cov
|
pytest-cov
|
||||||
pytest-runner
|
pytest-runner
|
||||||
python_requires = >=3.6
|
python_requires = >=3.7
|
||||||
use_scm_version = True
|
use_scm_version = True
|
||||||
|
|
||||||
|
[options.package_data]
|
||||||
|
nancy =
|
||||||
|
*.sql
|
||||||
|
|
||||||
[flake8]
|
[flake8]
|
||||||
exclude = .eggs,
|
exclude = .eggs,
|
||||||
.git,
|
.git,
|
||||||
__pycache__,
|
__pycache__,
|
||||||
dprov.egg-info,
|
nancy.egg-info,
|
||||||
build,
|
build,
|
||||||
dist
|
dist
|
||||||
# allow a little bit longer lines than default (79)
|
# allow a little bit longer lines than default (79)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user