commit 6858489b0b4ddfe751d63a986caf97f929bb36d7 Author: Jacob Hinkle Date: Fri Sep 16 15:04:57 2022 -0400 Add first draft of schema, setuptools, gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a1adbbe --- /dev/null +++ b/.gitignore @@ -0,0 +1,113 @@ +# Created by https://www.gitignore.io/api/python + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ + +### Python Patch ### +.venv/ + + +# End of https://www.gitignore.io/api/python diff --git a/dprov/store/schema.sql b/dprov/store/schema.sql new file mode 100644 index 0000000..5e3d129 --- /dev/null +++ b/dprov/store/schema.sql @@ -0,0 +1,131 @@ +-- Schema file for dprov.db +PRAGMA user_version=0; -- version of this schema. 
+-- ^^^^^ Increment the above whenever meaningful updates are made to this file. + +PRAGMA foreign_keys=ON; + +-- computational environment which can execute "runs" +CREATE TABLE environment (id INTEGER PRIMARY KEY NOT NULL, + hostname TEXT, -- platform.node(): 'lucky' + envvars_json TEXT, -- json.dumps(dict(os.environ)) + python_implementation TEXT, -- platform.python_implementation(): 'CPython' + python_strversion TEXT, -- sys.version: '3.9.7 (default, Sep 16 2021, 13:09:58) \n[GCC 7.5.0]' + python_hexversion INTEGER, -- sys.hexversion: 50923504 + python_host_platform TEXT -- platform.platform(): Linux-5.15.0-46-generic-x86_64-with-glibc2.31 +); + +-- A "process" is an execution of a single graph, starting from a number of roots. +-- It corresponds to a single host process. Note that if multiple calls to +-- dprov.run() are made within a single python script, multiple "processes" +-- may be created. Also note that historical processes are automatically +-- imported and merged when possible when loading "data" from disk. +CREATE TABLE process (id INTEGER PRIMARY KEY NOT NULL, + -- we use POSIX timestamps in UTC for time recording. + -- e.g. datetime.datetime.now().timestamp() + start_time REAL, + end_time REAL, + process_id INTEGER, -- host PID of python process on host OS + environment INTEGER NOT NULL, + FOREIGN KEY (environment) REFERENCES environment (id) ON DELETE RESTRICT +); +-- We try to track all python packages that impact execution by traversing a +-- copy of sys.modules. This is done once before a "process" and once after in +-- case some calling code winds up calling a previously-unloaded module. +CREATE TABLE py_package (id INTEGER PRIMARY KEY NOT NULL, + name TEXT NOT NULL, + version TEXT, + process INTEGER NOT NULL, + FOREIGN KEY (process) REFERENCES process (id) ON DELETE RESTRICT +); +-- A py_module describes any python module file containing decorated Functions. +-- Modules are tracked since they impact the global scope of function calls. 
+CREATE TABLE py_module(id INTEGER PRIMARY KEY NOT NULL, + name TEXT NOT NULL, + code TEXT, + package INTEGER, + FOREIGN KEY (package) REFERENCES py_package (id) ON DELETE RESTRICT +); +-- A "func" just describes a function, without reference to its arguments. It +-- can have inputs and outputs, which are described in the func_input and +-- func_output child tables. +CREATE TABLE func(id INTEGER PRIMARY KEY NOT NULL, + name TEXT NOT NULL, + module INTEGER NOT NULL, + -- we use POSIX timestamps in UTC for time recording. + -- e.g. datetime.datetime.now().timestamp() + start_time REAL, + end_time REAL, + FOREIGN KEY (module) REFERENCES py_module (id) ON DELETE RESTRICT +); +CREATE TABLE func_input(id INTEGER PRIMARY KEY NOT NULL, + name TEXT NOT NULL, + typename TEXT NOT NULL, + func INTEGER NOT NULL, + FOREIGN KEY (func) REFERENCES func (id) ON DELETE RESTRICT +); +CREATE TABLE func_output(id INTEGER PRIMARY KEY NOT NULL, + name TEXT, + position INTEGER NOT NULL, + typename TEXT NOT NULL, + func INTEGER NOT NULL, + FOREIGN KEY (func) REFERENCES func (id) ON DELETE RESTRICT +); + + +-- Tasks are executed calls to Functions: they correspond to a particular set of +-- inputs which are themselves data (see datum table). A task is executed in +-- the context of a "process". Within a process, tasks are typically evaluated +-- in a serial manner. +CREATE TABLE task(id INTEGER PRIMARY KEY NOT NULL, + func INTEGER NOT NULL, + process INTEGER NOT NULL, + FOREIGN KEY (func) REFERENCES func (id) ON DELETE RESTRICT, + FOREIGN KEY (process) REFERENCES process (id) ON DELETE RESTRICT +); +-- A datum is an object that is computed as the output of a task, given as a +-- literal value in a config file, or loaded from a file. +CREATE TABLE datum(id INTEGER PRIMARY KEY NOT NULL, + -- provider_type describes where the data came from. 
Choices are: + -- - COMPUTED: output of a decorated Function + -- - IMPORTED: imported output from a prior process + -- - LITERAL: literal value provided in a config file + -- Note that literals are implicitly imported (hash computed) at runtime + provider_type TEXT, + + -- task and task_output are only used when provider_type='COMPUTED' + task INTEGER, + task_output INTEGER, + + literal_json TEXT, -- if provider_type='LITERAL', a JSON representation of the value + + typename TEXT NOT NULL, -- string representation of the data type + + path TEXT, -- if residing on disk, the path relative to this db file + + -- hash of provenance summarizes all dependencies, independent of environment + -- computed _before_ computation + hash_provenance TEXT, + hash_content TEXT, -- hash of content + + -- Version starts at 0 when provided/first computed. When passed as + -- a non-const input to a non-const function, this version is recorded and + -- incremented + current_version INTEGER NOT NULL, + + FOREIGN KEY (task) REFERENCES task (id) ON DELETE RESTRICT, + FOREIGN KEY (task_output) REFERENCES func_output (id) ON DELETE RESTRICT +); +-- A task input records the version of a Datum that is passed to the function +CREATE TABLE task_input(id INTEGER PRIMARY KEY NOT NULL, + task INTEGER NOT NULL, + func_input INTEGER NOT NULL, + + datum INTEGER NOT NULL, + -- Data have versions to facilitate tracking non-const operations. 
If a datum + -- is passed to a non-const operation, a new datum is provided + datum_version INTEGER NOT NULL, + + FOREIGN KEY (task) REFERENCES task (id) ON DELETE RESTRICT, + FOREIGN KEY (func_input) REFERENCES func_input (id) ON DELETE RESTRICT, + FOREIGN KEY (datum) REFERENCES datum (id) ON DELETE RESTRICT +); diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..384435e --- /dev/null +++ b/setup.cfg @@ -0,0 +1,41 @@ +[metadata] +name = DProv +author = Jacob Hinkle +author_email = hinklejd@ornl.gov +url = https://code.ornl.gov/4jh/dprov +keywords = data-science workflows reproducibility provenance +version = attr: setuptools_scm.get_version + +[tool:pytest] +testpaths = tests +addopts = --verbose --cov dprov/ + +[aliases] +test=pytest + +[options] +packages = + dprov +install_requires = + dill +setup_requires = + setuptools_scm +tests_require = + coverage + pytest + pytest-cov + pytest-runner +python_requires = >=3.6 +use_scm_version = True + +[flake8] +exclude = .eggs, + .git, + __pycache__, + dprov.egg-info, + build, + dist +ignore = W504 +# allow a little bit longer lines than default (79) +max-line-length= 99 +show-source = True diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..6068493 --- /dev/null +++ b/setup.py @@ -0,0 +1,3 @@ +from setuptools import setup + +setup()