diff --git a/nancy/__init__.py b/nancy/__init__.py
new file mode 100644
index 0000000..4551337
--- /dev/null
+++ b/nancy/__init__.py
@@ -0,0 +1,76 @@
+import sqlite3
+
+from . import db
+
+__version__ = "0.0.1"
+
+
+# Calling code will build up a large DAG then at the very last step, call
+# nancy.save_data(). Only at that point will we determine where to save the
+# data by asking the user for an output dir. However, we need a database
+# initialized in order to build up the DAG in the first place, so here we
+# initialize an in-memory database to use until we have an output directory.
+_conn = db.init(":memory:")
+
+
+def save_data(
+    destination_mapping,
+    desc=f"Provenance-tracking with nancy v{__version__}",
+):
+    """
+    A command-line interface to compute and save a set of outputs.
+
+    Parses ``--out_dir``/``-o`` from the command line and exits with status 1
+    if that directory already exists. Otherwise the directory is created and
+    the in-memory provenance database is copied to ``<out_dir>/prov.db``,
+    which becomes the module-level connection from then on.
+
+    Args:
+        destination_mapping: dict of outputs to save, keyed by destination
+            path (as in the example below). Not yet consumed by this function.
+        desc: description text shown in the command-line help.
+
+    Ex:
+
+    .. code:
+        save_data({
+            "scores.csv": scores,
+            "checkpoints": checkpoint_dir,
+            "plots/learning_curve.pdf": learning_curve_plot,
+            "plots/test_roc.pdf": test_auc_plot,
+        })
+    """
+    global _conn
+
+    import argparse
+    import os
+    import sys
+
+    parser = argparse.ArgumentParser(description=desc)
+    parser.add_argument(
+        "--out_dir",
+        "-o",
+        required=True,
+        help="Directory (will be created) in which to save all outputs",
+    )
+    args = parser.parse_args()
+
+    if os.path.exists(args.out_dir):
+        print(
+            f"Output directory {args.out_dir} exists. Refusing to overwrite.",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    # Create the output directory, back up the in-memory db to this location,
+    # and switch to it.
+    os.makedirs(args.out_dir, exist_ok=True)
+    dst_conn = sqlite3.connect(os.path.join(args.out_dir, "prov.db"))
+    _conn.backup(dst_conn)
+    _conn.close()
+    # Foreign-key enforcement is per-connection in SQLite, so turn it on for the
+    # connection we are switching to.
+    dst_conn.execute("PRAGMA foreign_keys = ON;")
+    _conn = dst_conn
+
+    # Now we save the provided roots, and record their locations in _conn.data
diff --git a/nancy/db.py b/nancy/db.py
new file mode 100644
index 0000000..573e26a
--- /dev/null
+++ b/nancy/db.py
@@ -0,0 +1,28 @@
+import importlib.resources
+import os
+import sqlite3
+
+
+def init(path):
+    """
+    Create a new database at ``path`` and load the nancy schema into it.
+
+    ``path`` is handed to ``sqlite3.connect``, so ``":memory:"`` yields an
+    in-memory database. Raises ``FileExistsError`` if ``path`` already exists
+    on disk. Returns the open connection.
+    """
+    if os.path.exists(path):
+        raise FileExistsError(f"Refusing to overwrite existing database {path}")
+    schema = importlib.resources.read_text("nancy", "schema.sql")
+    conn = sqlite3.connect(path)
+    conn.cursor().executescript(schema)
+    return conn
+
+
+def connect(path):
+    """
+    Open the database at ``path`` with foreign key enforcement switched on.
+    """
+    conn = sqlite3.connect(path)
+    conn.cursor().execute("PRAGMA foreign_keys = ON;")
+    return conn
diff --git a/dprov/store/schema.sql b/nancy/schema.sql
similarity index 79%
rename from dprov/store/schema.sql
rename to nancy/schema.sql
index 5e3d129..0417674 100644
--- a/dprov/store/schema.sql
+++ b/nancy/schema.sql
@@ -1,4 +1,4 @@
--- Schema file for dprov.db
+-- Schema file for nancy.db
 PRAGMA user_version=0; -- version of this schema.
 -- ^^^^^ Increment the above whenever meaningful updates are made to this file.
 
@@ -6,7 +6,7 @@ PRAGMA foreign_keys=ON;
 
 -- computational environment which can execute "runs"
 CREATE TABLE environment (id INTEGER PRIMARY KEY NOT NULL,
-    hostname TEXT, -- platform.node(): 'lucky'
+    hostname TEXT, -- platform.node(): computer's network name
     envvars_json TEXT, -- json.dumps(dict(os.environ))
     python_implementation TEXT, -- platform.python_implementation(): 'cpython'
     python_strversion TEXT, -- sys.version: '3.9.7 (default, Sep 16 2021, 13:09:58) \n[GCC 7.5.0]'
@@ -16,7 +16,7 @@ CREATE TABLE environment (id INTEGER PRIMARY KEY NOT NULL,
 
 -- A "process" is an execution of a single graph, starting from a number of roots
 -- It corresponds to a single host process. Note that if multiple calls to
--- dprov.run() are provided within a single python script, multiple "processes"
+-- nancy.run() are provided within a single python script, multiple "processes"
 -- may be created. Also note that historical processes are automatically
 -- imported and merged when possible when loading "data" from disk.
 CREATE TABLE process (id INTEGER PRIMARY KEY NOT NULL,
@@ -26,7 +26,7 @@ CREATE TABLE process (id INTEGER PRIMARY KEY NOT NULL,
     end_time REAL,
     process_id INTEGER, -- host PID of python process on host OS
     environment INTEGER NOT NULL,
-    FOREIGN KEY (environment) REFERENCES environment (id) ON DELETE RESTRICT
+    FOREIGN KEY (environment) REFERENCES environment (id)
 );
 -- We try to track all python packages that impact execution by traversing a
 -- copy of sys.modules. This is done once before a "process" and once after in
@@ -35,7 +35,7 @@ CREATE TABLE py_package (id INTEGER PRIMARY KEY NOT NULL,
     name TEXT NOT NULL,
     version TEXT,
     process INTEGER NOT NULL,
-    FOREIGN KEY (process) REFERENCES process (id) ON DELETE RESTRICT
+    FOREIGN KEY (process) REFERENCES process (id)
 );
 -- A py_module describes any python module file containing decorated Functions.
 -- Modules are tracked since they impact the global scope of function calls.
@@ -43,7 +43,7 @@ CREATE TABLE py_module(id INTEGER PRIMARY KEY NOT NULL,
     name TEXT NOT NULL,
     code TEXT,
     package INTEGER,
-    FOREIGN KEY (package) REFERENCES py_package (id) ON DELETE RESTRICT
+    FOREIGN KEY (package) REFERENCES py_package (id)
 );
 -- A "func" just describes a function, without reference to its arguments. It
 -- can have inputs and outputs, which are described in the func_inputs and
@@ -55,20 +55,30 @@ CREATE TABLE func(id INTEGER PRIMARY KEY NOT NULL,
     -- e.g. datetime.datetime.now().timestamp()
     start_time REAL,
     end_time REAL,
-    FOREIGN KEY (module) REFERENCES py_module (id) ON DELETE RESTRICT
+    FOREIGN KEY (module) REFERENCES py_module (id)
 );
 CREATE TABLE func_input(id INTEGER PRIMARY KEY NOT NULL,
     name TEXT NOT NULL,
     typename TEXT NOT NULL,
     func INTEGER NOT NULL,
-    FOREIGN KEY (func) REFERENCES func (id) ON DELETE RESTRICT
+
+    position INTEGER,
+    posonly BOOL,
+    kwonly BOOL,
+
+    description TEXT,
+
+    FOREIGN KEY (func) REFERENCES func (id)
 );
 CREATE TABLE func_output(id INTEGER PRIMARY KEY NOT NULL,
     name TEXT,
-    position INTEGER NOT NULL,
     typename TEXT NOT NULL,
     func INTEGER NOT NULL,
-    FOREIGN KEY (func) REFERENCES func (id) ON DELETE RESTRICT
+
+    position INTEGER NOT NULL,
+    description TEXT,
+
+    FOREIGN KEY (func) REFERENCES func (id)
 );
 
 
@@ -79,12 +89,12 @@ CREATE TABLE func_output(id INTEGER PRIMARY KEY NOT NULL,
 CREATE TABLE task(id INTEGER PRIMARY KEY NOT NULL,
     func INTEGER NOT NULL,
     process INTEGER NOT NULL,
-    FOREIGN KEY (func) REFERENCES func (id) ON DELETE RESTRICT,
-    FOREIGN KEY (process) REFERENCES process (id) ON DELETE RESTRICT
+    FOREIGN KEY (func) REFERENCES func (id),
+    FOREIGN KEY (process) REFERENCES process (id)
 );
 -- A datum is an object that is computed as the output of a task, given as a
 -- literal value in a config file, or loaded from a file.
-CREATE TABLE datum(id INTEGER PRIMARY KEY NOT NULL,
+CREATE TABLE data(id INTEGER PRIMARY KEY NOT NULL,
     -- provider_type describes where the data came from. Choices are:
     -- - COMPUTED: output of a decorated Function
     -- - IMPORTED: imported output from a prior process
@@ -100,7 +110,8 @@ CREATE TABLE datum(id INTEGER PRIMARY KEY NOT NULL,
 
     typename TEXT NOT NULL, -- string representation of the data type
 
-    path TEXT, -- if residing on disk, the path relative to this db file
+    saved_path TEXT, -- if residing on disk, the path relative to this db file
+    saved_version TEXT, -- if residing on disk, the version that was saved (presumably a value of current_version; TODO confirm)
 
     -- hash of provenance summarizes all dependencies, independent of environment
     -- computed _before_ computation
@@ -112,8 +123,8 @@ CREATE TABLE datum(id INTEGER PRIMARY KEY NOT NULL,
     -- incremented
     current_version INTEGER NOT NULL,
 
-    FOREIGN KEY (task) REFERENCES task (id) ON DELETE RESTRICT,
-    FOREIGN KEY (task_output) REFERENCES func_output (id) ON DELETE RESTRICT
+    FOREIGN KEY (task) REFERENCES task (id),
+    FOREIGN KEY (task_output) REFERENCES func_output (id)
 );
 -- A task input records the version of a Datum that is passed to the function
 CREATE TABLE task_input(id INTEGER PRIMARY KEY NOT NULL,
@@ -125,7 +136,7 @@ CREATE TABLE task_input(id INTEGER PRIMARY KEY NOT NULL,
     -- is passed to a non-const operation, a new datum is provided
     datum_version INTEGER NOT NULL,
 
-    FOREIGN KEY (task) REFERENCES task (id) ON DELETE RESTRICT,
-    FOREIGN KEY (func_input) REFERENCES func_input (id) ON DELETE RESTRICT,
-    FOREIGN KEY (datum) REFERENCES datum (id) ON DELETE RESTRICT
+    FOREIGN KEY (task) REFERENCES task (id),
+    FOREIGN KEY (func_input) REFERENCES func_input (id),
+    FOREIGN KEY (datum) REFERENCES data (id)
 );
diff --git a/setup.cfg b/setup.cfg
index 384435e..c360e13 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,21 +1,21 @@
 [metadata]
-name = DProv
+name = nancy
 author = Jacob Hinkle
 author_email = hinklejd@ornl.gov
-url = https://code.ornl.gov/4jh/dprov
+url = https://code.ornl.gov/4jh/nancy
 keywords = data-science workflows reproducibility provenance
 version = attr: setuptools_scm.get_version
 
 [tool:pytest]
 testpaths = tests
-addopts = --verbose --cov dprov/
+addopts = --verbose --cov nancy/
 
 [aliases]
 test=pytest
 
 [options]
 packages =
-    superfun
+    nancy
 install_requires =
     dill
 setup_requires =
@@ -25,14 +25,18 @@ tests_require =
     pytest
     pytest-cov
     pytest-runner
-python_requires = >=3.6
+python_requires = >=3.7
 use_scm_version = True
+[options.package_data]
+nancy =
+    *.sql
+
 [flake8]
 exclude =
     .eggs,
     .git,
     __pycache__,
-    dprov.egg-info,
+    nancy.egg-info,
     build,
     dist
 # allow a little bit longer lines than default (79)