Rename to nancy

This commit is contained in:
Jacob Hinkle 2022-09-17 12:57:15 -04:00
parent 6858489b0b
commit 007c22d3e2
4 changed files with 125 additions and 25 deletions

64
nancy/__init__.py Normal file
View File

@ -0,0 +1,64 @@
import sqlite3
from . import db
__version__ = "0.0.1"
# Calling code will build up a large DAG and then, at the very last step, call
# nancy.save_data(). Only at that point do we determine where to save the
# data, by asking the user for an output directory on the command line.
# However, a database must already be initialized in order to build up the DAG
# in the first place, so here we initialize an in-memory database to use until
# an output directory is known. save_data() later copies it to disk and
# rebinds _conn to the on-disk connection.
_conn = db.init(":memory:")
def save_data(
    destination_mapping,
    desc=f"Provenance-tracking with nancy v{__version__}",
):
    """
    A command-line interface to compute and save a set of outputs.

    Parses ``--out_dir``/``-o`` from the command line, creates that directory,
    copies the in-memory provenance database into ``<out_dir>/prov.db`` and
    rebinds the module-level connection ``_conn`` to the on-disk database.

    Ex:

    .. code::

        save_data({
            "scores.csv": scores,
            "checkpoints": checkpoint_dir,
            "plots/learning_curve.pdf": learning_curve_plot,
            "plots/test_roc.pdf": test_auc_plot,
        })

    Args:
        destination_mapping: Mapping of destination name to the object to
            save, as in the example above (presumably paths relative to the
            output directory -- TODO confirm). Not consumed yet; saving the
            roots is still to be implemented.
        desc: Description shown in the command-line help text.

    Exits the process with status 1 (after printing a message to stderr) if
    the requested output directory already exists.
    """
    global _conn

    import argparse
    import os
    import sys

    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument(
        "--out_dir",
        "-o",
        required=True,
        help="Directory (will be created) in which to save all outputs",
    )
    args = parser.parse_args()

    # Create the output directory and detect a pre-existing one in a single
    # step. A separate exists() check followed by makedirs(exist_ok=True)
    # would let a directory created in between be silently reused.
    try:
        os.makedirs(args.out_dir)
    except FileExistsError:
        print(
            f"Output directory {args.out_dir} exists. Refusing to overwrite.",
            file=sys.stderr,
        )
        sys.exit(1)

    # Back up the in-memory db to the output directory and switch to it.
    # Use db.connect() rather than sqlite3.connect() so that foreign-key
    # enforcement (a per-connection setting) stays on for the new connection.
    dst_conn = db.connect(os.path.join(args.out_dir, "prov.db"))
    _conn.backup(dst_conn)
    _conn.close()
    _conn = dst_conn

    # TODO: save the provided roots (destination_mapping) and record their
    # locations in the "data" table of _conn.

21
nancy/db.py Normal file
View File

@ -0,0 +1,21 @@
import importlib.resources
import os
import sqlite3
def init(path):
    """
    Create a new database at ``path`` and load the packaged schema into it.

    Args:
        path: Location of the database to create, passed to
            ``sqlite3.connect`` (the package itself calls this with
            ``":memory:"``).

    Returns:
        An open ``sqlite3.Connection`` on which ``schema.sql`` from the
        ``nancy`` package has been executed.

    Raises:
        FileExistsError: If something already exists at ``path``.
    """
    if os.path.exists(path):
        raise FileExistsError(f"Refusing to overwrite existing database {path}")

    # Read the schema inside a context manager so the resource handle is
    # closed deterministically instead of being left to the garbage collector.
    with importlib.resources.open_text("nancy", "schema.sql") as schema_file:
        schema = schema_file.read()

    conn = sqlite3.connect(path)
    try:
        conn.executescript(schema)
    except sqlite3.Error:
        # Do not leak the connection if the schema fails to load.
        conn.close()
        raise
    return conn
def connect(path):
    """
    Open the SQLite database at ``path`` with foreign-key enforcement on.

    Returns the open ``sqlite3.Connection``.
    """
    connection = sqlite3.connect(path)
    # Foreign-key enforcement is a per-connection setting, so it is switched
    # on every time a connection is opened.
    pragma_cursor = connection.cursor()
    pragma_cursor.execute("PRAGMA foreign_keys = ON;")
    return connection

View File

@ -1,4 +1,4 @@
-- Schema file for dprov.db
-- Schema file for nancy.db
PRAGMA user_version=0; -- version of this schema.
-- ^^^^^ Increment the above whenever meaningful updates are made to this file.
@ -6,7 +6,7 @@ PRAGMA foreign_keys=ON;
-- computational environment which can execute "runs"
CREATE TABLE environment (id INTEGER PRIMARY KEY NOT NULL,
hostname TEXT, -- platform.node(): 'lucky'
hostname TEXT, -- platform.node(): computer's network name
envvars_json TEXT, -- json.dumps(dict(os.environ))
python_implementation TEXT, -- platform.python_implementation(): 'cpython'
python_strversion TEXT, -- sys.version: '3.9.7 (default, Sep 16 2021, 13:09:58) \n[GCC 7.5.0]'
@ -16,7 +16,7 @@ CREATE TABLE environment (id INTEGER PRIMARY KEY NOT NULL,
-- A "process" is an execution of a single graph, starting from a number of roots
-- It corresponds to a single host process. Note that if multiple calls to
-- dprov.run() are provided within a single python script, multiple "processes"
-- nancy.run() are provided within a single python script, multiple "processes"
-- may be created. Also note that historical processes are automatically
-- imported and merged when possible when loading "data" from disk.
CREATE TABLE process (id INTEGER PRIMARY KEY NOT NULL,
@ -26,7 +26,7 @@ CREATE TABLE process (id INTEGER PRIMARY KEY NOT NULL,
end_time REAL,
process_id INTEGER, -- host PID of python process on host OS
environment INTEGER NOT NULL,
FOREIGN KEY (environment) REFERENCES environment (id) ON DELETE RESTRICT
FOREIGN KEY (environment) REFERENCES environment (id)
);
-- We try to track all python packages that impact execution by traversing a
-- copy of sys.modules. This is done once before a "process" and once after in
@ -35,7 +35,7 @@ CREATE TABLE py_package (id INTEGER PRIMARY KEY NOT NULL,
name TEXT NOT NULL,
version TEXT,
process INTEGER NOT NULL,
FOREIGN KEY (process) REFERENCES process (id) ON DELETE RESTRICT
FOREIGN KEY (process) REFERENCES process (id)
);
-- A py_module describes any python module file containing decorated Functions.
-- Modules are tracked since they impact the global scope of function calls.
@ -43,7 +43,7 @@ CREATE TABLE py_module(id INTEGER PRIMARY KEY NOT NULL,
name TEXT NOT NULL,
code TEXT,
package INTEGER,
FOREIGN KEY (package) REFERENCES py_package (id) ON DELETE RESTRICT
FOREIGN KEY (package) REFERENCES py_package (id)
);
-- A "func" just describes a function, without reference to its arguments. It
-- can have inputs and outputs, which are described in the func_inputs and
@ -55,20 +55,30 @@ CREATE TABLE func(id INTEGER PRIMARY KEY NOT NULL,
-- e.g. datetime.datetime.now().timestamp()
start_time REAL,
end_time REAL,
FOREIGN KEY (module) REFERENCES py_module (id) ON DELETE RESTRICT
FOREIGN KEY (module) REFERENCES py_module (id)
);
CREATE TABLE func_input(id INTEGER PRIMARY KEY NOT NULL,
name TEXT NOT NULL,
typename TEXT NOT NULL,
func INTEGER NOT NULL,
FOREIGN KEY (func) REFERENCES func (id) ON DELETE RESTRICT
position INTEGER,
posonly BOOL,
kwonly BOOL,
description TEXT,
FOREIGN KEY (func) REFERENCES func (id)
);
CREATE TABLE func_output(id INTEGER PRIMARY KEY NOT NULL,
name TEXT,
position INTEGER NOT NULL,
typename TEXT NOT NULL,
func INTEGER NOT NULL,
FOREIGN KEY (func) REFERENCES func (id) ON DELETE RESTRICT
position INTEGER NOT NULL,
description TEXT,
FOREIGN KEY (func) REFERENCES func (id)
);
@ -79,12 +89,12 @@ CREATE TABLE func_output(id INTEGER PRIMARY KEY NOT NULL,
CREATE TABLE task(id INTEGER PRIMARY KEY NOT NULL,
func INTEGER NOT NULL,
process INTEGER NOT NULL,
FOREIGN KEY (func) REFERENCES func (id) ON DELETE RESTRICT,
FOREIGN KEY (process) REFERENCES process (id) ON DELETE RESTRICT
FOREIGN KEY (func) REFERENCES func (id),
FOREIGN KEY (process) REFERENCES process (id)
);
-- A datum is an object that is computed as the output of a task, given as a
-- literal value in a config file, or loaded from a file.
CREATE TABLE datum(id INTEGER PRIMARY KEY NOT NULL,
CREATE TABLE data(id INTEGER PRIMARY KEY NOT NULL,
-- provider_type describes where the data came from. Choices are:
-- - COMPUTED: output of a decorated Function
-- - IMPORTED: imported output from a prior process
@ -100,7 +110,8 @@ CREATE TABLE datum(id INTEGER PRIMARY KEY NOT NULL,
typename TEXT NOT NULL, -- string representation of the data type
path TEXT, -- if residing on disk, the path relative to this db file
saved_path TEXT, -- if residing on disk, the path relative to this db file
saved_version TEXT, -- if residing on disk, the version of the data that was saved (TODO confirm)
-- hash of provenance summarizes all dependencies, independent of environment
-- computed _before_ computation
@ -112,8 +123,8 @@ CREATE TABLE datum(id INTEGER PRIMARY KEY NOT NULL,
-- incremented
current_version INTEGER NOT NULL,
FOREIGN KEY (task) REFERENCES task (id) ON DELETE RESTRICT,
FOREIGN KEY (task_output) REFERENCES func_output (id) ON DELETE RESTRICT
FOREIGN KEY (task) REFERENCES task (id),
FOREIGN KEY (task_output) REFERENCES func_output (id)
);
-- A task input records the version of a Datum that is passed to the function
CREATE TABLE task_input(id INTEGER PRIMARY KEY NOT NULL,
@ -125,7 +136,7 @@ CREATE TABLE task_input(id INTEGER PRIMARY KEY NOT NULL,
-- is passed to a non-const operation, a new datum is provided
datum_version INTEGER NOT NULL,
FOREIGN KEY (task) REFERENCES task (id) ON DELETE RESTRICT,
FOREIGN KEY (func_input) REFERENCES func_input (id) ON DELETE RESTRICT,
FOREIGN KEY (datum) REFERENCES datum (id) ON DELETE RESTRICT
FOREIGN KEY (task) REFERENCES task (id),
FOREIGN KEY (func_input) REFERENCES func_input (id),
FOREIGN KEY (datum) REFERENCES datum (id)
);

View File

@ -1,21 +1,21 @@
[metadata]
name = DProv
name = nancy
author = Jacob Hinkle
author_email = hinklejd@ornl.gov
url = https://code.ornl.gov/4jh/dprov
url = https://code.ornl.gov/4jh/nancy
keywords = data-science workflows reproducibility provenance
version = attr: setuptools_scm.get_version
[tool:pytest]
testpaths = tests
addopts = --verbose --cov dprov/
addopts = --verbose --cov nancy/
[aliases]
test=pytest
[options]
packages =
superfun
nancy
install_requires =
dill
setup_requires =
@ -25,14 +25,18 @@ tests_require =
pytest
pytest-cov
pytest-runner
python_requires = >=3.6
python_requires = >=3.7
use_scm_version = True
[options.package_data]
nancy =
*.sql
[flake8]
exclude = .eggs,
.git,
__pycache__,
dprov.egg-info,
nancy.egg-info,
build,
dist
# allow a little bit longer lines than default (79)