Add first draft of schema, setuptools, gitignore
This commit is contained in:
commit
6858489b0b
113
.gitignore
vendored
Normal file
113
.gitignore
vendored
Normal file
@ -0,0 +1,113 @@
|
||||
# Created by https://www.gitignore.io/api/python
|
||||
|
||||
### Python ###
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
||||
# PyInstaller
|
||||
# Usually these files are written by a python script from a template
|
||||
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||
*.manifest
|
||||
*.spec
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
pip-delete-this-directory.txt
|
||||
|
||||
# Unit test / coverage reports
|
||||
htmlcov/
|
||||
.tox/
|
||||
.coverage
|
||||
.coverage.*
|
||||
.cache
|
||||
nosetests.xml
|
||||
coverage.xml
|
||||
*.cover
|
||||
.hypothesis/
|
||||
.pytest_cache/
|
||||
|
||||
# Translations
|
||||
*.mo
|
||||
*.pot
|
||||
|
||||
# Django stuff:
|
||||
*.log
|
||||
local_settings.py
|
||||
db.sqlite3
|
||||
|
||||
# Flask stuff:
|
||||
instance/
|
||||
.webassets-cache
|
||||
|
||||
# Scrapy stuff:
|
||||
.scrapy
|
||||
|
||||
# Sphinx documentation
|
||||
docs/_build/
|
||||
|
||||
# PyBuilder
|
||||
target/
|
||||
|
||||
# Jupyter Notebook
|
||||
.ipynb_checkpoints
|
||||
|
||||
# pyenv
|
||||
.python-version
|
||||
|
||||
# celery beat schedule file
|
||||
celerybeat-schedule
|
||||
|
||||
# SageMath parsed files
|
||||
*.sage.py
|
||||
|
||||
# Environments
|
||||
.env
|
||||
.venv
|
||||
env/
|
||||
venv/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
|
||||
# Spyder project settings
|
||||
.spyderproject
|
||||
.spyproject
|
||||
|
||||
# Rope project settings
|
||||
.ropeproject
|
||||
|
||||
# mkdocs documentation
|
||||
/site
|
||||
|
||||
# mypy
|
||||
.mypy_cache/
|
||||
|
||||
### Python Patch ###
|
||||
.venv/
|
||||
|
||||
|
||||
# End of https://www.gitignore.io/api/python
|
||||
131
dprov/store/schema.sql
Normal file
131
dprov/store/schema.sql
Normal file
@ -0,0 +1,131 @@
|
||||
-- Schema file for dprov.db

-- Version stamp of this schema. Increment the value below whenever a
-- meaningful update is made to this file.
PRAGMA user_version = 0;

-- Turn on enforcement of the FOREIGN KEY clauses declared in this file.
-- NOTE: SQLite applies this setting per connection, so it only covers the
-- connection that executes this script; other connections must enable it too.
PRAGMA foreign_keys = ON;
|
||||
|
||||
-- A computational environment in which "runs" can be executed.
CREATE TABLE environment (
    id INTEGER PRIMARY KEY NOT NULL,
    -- platform.node(), e.g. 'lucky'
    hostname TEXT,
    -- json.dumps(dict(os.environ))
    envvars_json TEXT,
    -- platform.python_implementation(), e.g. 'cpython'
    python_implementation TEXT,
    -- sys.version, e.g. '3.9.7 (default, Sep 16 2021, 13:09:58) \n[GCC 7.5.0]'
    python_strversion TEXT,
    -- sys.hexversion, e.g. 50923504
    python_hexversion INTEGER,
    -- platform.platform(), e.g. Linux-5.15.0-46-generic-x86_64-with-glibc2.31
    python_host_platform TEXT
);
|
||||
|
||||
-- A "process" is one execution of a single graph, starting from a number of
-- roots, and corresponds to a single host process. If a single python script
-- makes multiple calls to dprov.run(), multiple "processes" may be created.
-- Historical processes are automatically imported and merged, when possible,
-- when "data" are loaded from disk.
CREATE TABLE process (
    id INTEGER PRIMARY KEY NOT NULL,
    -- Times are recorded as POSIX timestamps in UTC,
    -- e.g. datetime.datetime.now().timestamp()
    start_time REAL,
    end_time REAL,
    -- PID of the python process on the host OS
    process_id INTEGER,
    -- the environment this process ran in
    environment INTEGER NOT NULL,
    FOREIGN KEY (environment) REFERENCES environment (id) ON DELETE RESTRICT
);
|
||||
-- Python packages that impact execution are tracked by traversing a copy of
-- sys.modules. The traversal is done once before a "process" and once after
-- it, in case some calling code winds up calling a previously-unloaded module.
CREATE TABLE py_package (
    id INTEGER PRIMARY KEY NOT NULL,
    name TEXT NOT NULL,
    version TEXT,
    -- the process this package was recorded for
    process INTEGER NOT NULL,
    FOREIGN KEY (process) REFERENCES process (id) ON DELETE RESTRICT
);
|
||||
-- A py_module describes any python module file containing decorated Functions.
-- Modules are tracked because they impact the global scope of function calls.
CREATE TABLE py_module (
    id INTEGER PRIMARY KEY NOT NULL,
    name TEXT NOT NULL,
    code TEXT,
    -- optional reference to a py_package row (nullable)
    package INTEGER,
    FOREIGN KEY (package) REFERENCES py_package (id) ON DELETE RESTRICT
);
|
||||
-- A "func" just describes a function, without reference to its arguments. Its
-- inputs and outputs are described in the func_input and func_output child
-- tables.
CREATE TABLE func (
    id INTEGER PRIMARY KEY NOT NULL,
    name TEXT NOT NULL,
    -- the py_module this function is defined in
    module INTEGER NOT NULL,
    -- Times are recorded as POSIX timestamps in UTC,
    -- e.g. datetime.datetime.now().timestamp()
    -- NOTE(review): these timing columns sit on the function description while
    -- the task table has none -- confirm this is intended.
    start_time REAL,
    end_time REAL,
    FOREIGN KEY (module) REFERENCES py_module (id) ON DELETE RESTRICT
);
|
||||
-- One row per declared input of a func; both name and typename are required.
CREATE TABLE func_input (
    id INTEGER PRIMARY KEY NOT NULL,
    name TEXT NOT NULL,
    typename TEXT NOT NULL,
    -- the func this input belongs to
    func INTEGER NOT NULL,
    FOREIGN KEY (func) REFERENCES func (id) ON DELETE RESTRICT
);
|
||||
-- One row per declared output of a func. The name is optional; position and
-- typename are required.
CREATE TABLE func_output (
    id INTEGER PRIMARY KEY NOT NULL,
    name TEXT,
    position INTEGER NOT NULL,
    typename TEXT NOT NULL,
    -- the func this output belongs to
    func INTEGER NOT NULL,
    FOREIGN KEY (func) REFERENCES func (id) ON DELETE RESTRICT
);
|
||||
|
||||
|
||||
-- Tasks are executed calls to Functions: each corresponds to a particular set
-- of inputs, which are themselves data (see the datum table). A task is
-- executed in the context of a "process"; within a process, tasks are
-- typically evaluated serially.
CREATE TABLE task (
    id INTEGER PRIMARY KEY NOT NULL,
    -- the func that was called
    func INTEGER NOT NULL,
    -- the process in whose context the call was executed
    process INTEGER NOT NULL,
    FOREIGN KEY (func) REFERENCES func (id) ON DELETE RESTRICT,
    FOREIGN KEY (process) REFERENCES process (id) ON DELETE RESTRICT
);
|
||||
-- A datum is an object that is computed as the output of a task, given as a
-- literal value in a config file, or loaded from a file.
CREATE TABLE datum (
    id INTEGER PRIMARY KEY NOT NULL,

    -- provider_type describes where the data came from. Choices are:
    --  - COMPUTED: output of a decorated Function
    --  - IMPORTED: imported output from a prior process
    --  - LITERAL: literal value provided in a config file
    -- Note that literals are implicitly imported (hash computed) at runtime.
    -- The CHECK rejects any other non-NULL value, so a misspelled or
    -- differently-cased provider type cannot be stored silently. NULL still
    -- passes the CHECK, so the column remains nullable.
    provider_type TEXT
        CHECK (provider_type IN ('COMPUTED', 'IMPORTED', 'LITERAL')),

    -- task and task_output are only used when provider_type='COMPUTED'
    task INTEGER,
    task_output INTEGER,

    -- if provider_type='LITERAL', a JSON representation of the value
    literal_json TEXT,

    -- string representation of the data type
    typename TEXT NOT NULL,

    -- if residing on disk, the path relative to this db file
    path TEXT,

    -- hash of provenance summarizes all dependencies, independent of
    -- environment; it is computed _before_ computation
    hash_provenance TEXT,
    -- hash of content
    hash_content TEXT,

    -- Version starts at 0 when provided/first computed. When passed as a
    -- non-const input to a non-const function, this version is recorded and
    -- incremented.
    current_version INTEGER NOT NULL,

    FOREIGN KEY (task) REFERENCES task (id) ON DELETE RESTRICT,
    FOREIGN KEY (task_output) REFERENCES func_output (id) ON DELETE RESTRICT
);
|
||||
-- A task input records the version of a Datum that is passed to the function.
CREATE TABLE task_input (
    id INTEGER PRIMARY KEY NOT NULL,
    -- the task receiving the input
    task INTEGER NOT NULL,
    -- the func_input slot being filled
    func_input INTEGER NOT NULL,

    -- the datum that was passed
    datum INTEGER NOT NULL,
    -- Data have versions to facilitate tracking non-const operations. If a
    -- datum is passed to a non-const operation, a new datum is provided.
    datum_version INTEGER NOT NULL,

    FOREIGN KEY (task) REFERENCES task (id) ON DELETE RESTRICT,
    FOREIGN KEY (func_input) REFERENCES func_input (id) ON DELETE RESTRICT,
    FOREIGN KEY (datum) REFERENCES datum (id) ON DELETE RESTRICT
);
|
||||
41
setup.cfg
Normal file
41
setup.cfg
Normal file
@ -0,0 +1,41 @@
|
||||
[metadata]
|
||||
name = DProv
|
||||
author = Jacob Hinkle
|
||||
author_email = hinklejd@ornl.gov
|
||||
url = https://code.ornl.gov/4jh/dprov
|
||||
keywords = data-science workflows reproducibility provenance
|
||||
version = attr: setuptools_scm.get_version
|
||||
|
||||
[tool:pytest]
|
||||
testpaths = tests
|
||||
addopts = --verbose --cov dprov/
|
||||
|
||||
[aliases]
|
||||
test=pytest
|
||||
|
||||
[options]
|
||||
packages =
|
||||
superfun
|
||||
install_requires =
|
||||
dill
|
||||
setup_requires =
|
||||
setuptools_scm
|
||||
tests_require =
|
||||
coverage
|
||||
pytest
|
||||
pytest-cov
|
||||
pytest-runner
|
||||
python_requires = >=3.6
|
||||
use_scm_version = True
|
||||
|
||||
[flake8]
|
||||
exclude = .eggs,
|
||||
.git,
|
||||
__pycache__,
|
||||
dprov.egg-info,
|
||||
build,
|
||||
dist
|
||||
# max-line-length below allows slightly longer lines than the default (79)
|
||||
ignore = W504
|
||||
max-line-length= 99
|
||||
show-source = True
|
||||
Loading…
x
Reference in New Issue
Block a user