132 lines
5.6 KiB
SQL
132 lines
5.6 KiB
SQL
-- Schema file for dprov.db
|
|
-- Version number of this schema, kept in the database's user_version slot.
-- Bump it whenever a meaningful change is made to this file.
PRAGMA user_version = 0;

-- Enforce the FOREIGN KEY clauses declared on the tables below. SQLite applies
-- this setting per connection, so every connection that writes to the
-- database needs to enable it as well.
PRAGMA foreign_keys = ON;
|
|
|
|
-- computational environment which can execute "runs"
|
|
CREATE TABLE environment (
    id INTEGER PRIMARY KEY NOT NULL,

    -- platform.node(), e.g. 'lucky'
    hostname TEXT,

    -- json.dumps(dict(os.environ))
    envvars_json TEXT,

    -- platform.python_implementation(), e.g. 'cpython'
    python_implementation TEXT,

    -- sys.version, e.g. '3.9.7 (default, Sep 16 2021, 13:09:58) \n[GCC 7.5.0]'
    python_strversion TEXT,

    -- sys.hexversion, e.g. 50923504
    python_hexversion INTEGER,

    -- platform.platform(), e.g. Linux-5.15.0-46-generic-x86_64-with-glibc2.31
    python_host_platform TEXT
);
|
|
|
|
-- A "process" is the execution of a single graph, starting from a number of
-- roots. It corresponds to a single host process. Note that if dprov.run() is
-- called several times within a single Python script, multiple "processes"
-- may be created. Also note that historical processes are automatically
-- imported and merged, when possible, when loading "data" from disk.
|
|
CREATE TABLE process (
    id INTEGER PRIMARY KEY NOT NULL,

    -- Times are recorded as POSIX timestamps in UTC,
    -- e.g. datetime.datetime.now().timestamp()
    start_time REAL,
    end_time REAL,

    -- PID of the Python process on the host OS
    process_id INTEGER,

    -- Row in the environment table that this process refers to
    environment INTEGER NOT NULL,

    CONSTRAINT process_environment_fk
        FOREIGN KEY (environment) REFERENCES environment (id)
        ON DELETE RESTRICT
);
|
|
-- We try to track all Python packages that impact execution by traversing a
-- copy of sys.modules. This is done once before a "process" and once after, in
-- case the executed code ends up loading a previously unloaded module.
|
|
CREATE TABLE py_package (
    id INTEGER PRIMARY KEY NOT NULL,

    -- Package name (required) and its version string (may be absent)
    name TEXT NOT NULL,
    version TEXT,

    -- Row in the process table that this package record belongs to
    process INTEGER NOT NULL,

    CONSTRAINT py_package_process_fk
        FOREIGN KEY (process) REFERENCES process (id)
        ON DELETE RESTRICT
);
|
|
-- A py_module describes any python module file containing decorated Functions.
|
|
-- Modules are tracked since they impact the global scope of function calls.
|
|
CREATE TABLE py_module (
    id INTEGER PRIMARY KEY NOT NULL,

    -- Module name (required)
    name TEXT NOT NULL,

    -- Text stored for the module; optional.
    -- NOTE(review): presumably the module's source code -- confirm against the writer.
    code TEXT,

    -- Optional link to the py_package row this module is associated with
    package INTEGER,

    CONSTRAINT py_module_package_fk
        FOREIGN KEY (package) REFERENCES py_package (id)
        ON DELETE RESTRICT
);
|
|
-- A "func" just describes a function, without reference to its arguments. It
-- can have inputs and outputs, which are described in the func_input and
-- func_output child tables.
|
|
CREATE TABLE func (
    id INTEGER PRIMARY KEY NOT NULL,

    -- Function name (required)
    name TEXT NOT NULL,

    -- Row in the py_module table that this function refers to
    module INTEGER NOT NULL,

    -- Times are recorded as POSIX timestamps in UTC,
    -- e.g. datetime.datetime.now().timestamp()
    -- NOTE(review): this file does not say which interval these two columns
    -- measure for a function -- confirm against the code that writes them.
    start_time REAL,
    end_time REAL,

    CONSTRAINT func_module_fk
        FOREIGN KEY (module) REFERENCES py_module (id)
        ON DELETE RESTRICT
);
|
|
CREATE TABLE func_input (
    id INTEGER PRIMARY KEY NOT NULL,

    -- Name and type name of the input (both required)
    name TEXT NOT NULL,
    typename TEXT NOT NULL,

    -- Row in the func table that this input belongs to
    func INTEGER NOT NULL,

    CONSTRAINT func_input_func_fk
        FOREIGN KEY (func) REFERENCES func (id)
        ON DELETE RESTRICT
);
|
|
CREATE TABLE func_output (
    id INTEGER PRIMARY KEY NOT NULL,

    -- Optional name of the output
    name TEXT,

    -- Required position of the output.
    -- NOTE(review): presumably its index among the function's return values -- confirm.
    position INTEGER NOT NULL,

    -- Type name of the output (required)
    typename TEXT NOT NULL,

    -- Row in the func table that this output belongs to
    func INTEGER NOT NULL,

    CONSTRAINT func_output_func_fk
        FOREIGN KEY (func) REFERENCES func (id)
        ON DELETE RESTRICT
);
|
|
|
|
|
|
-- Tasks are executed calls to Functions: they correspond to a particular set of
|
|
-- inputs which are themselves data (see datum table). A task is executed in
|
|
-- the context of a "process". Within a process, tasks are typically evaluated
|
|
-- in a serial manner.
|
|
CREATE TABLE task (
    id INTEGER PRIMARY KEY NOT NULL,

    -- Function that was called
    func INTEGER NOT NULL,

    -- Process in whose context the call was executed
    process INTEGER NOT NULL,

    CONSTRAINT task_func_fk
        FOREIGN KEY (func) REFERENCES func (id)
        ON DELETE RESTRICT,
    CONSTRAINT task_process_fk
        FOREIGN KEY (process) REFERENCES process (id)
        ON DELETE RESTRICT
);
|
|
-- A datum is an object that is computed as the output of a task, given as a
|
|
-- literal value in a config file, or loaded from a file.
|
|
CREATE TABLE datum (
    id INTEGER PRIMARY KEY NOT NULL,

    -- Where the data came from. Documented choices:
    --   COMPUTED: output of a decorated Function
    --   IMPORTED: imported output from a prior process
    --   LITERAL:  literal value provided in a config file
    -- Literals are implicitly imported (hash computed) at runtime.
    -- No constraint in this table enforces these values.
    provider_type TEXT,

    -- Only used when provider_type='COMPUTED': the producing task and the
    -- func_output row the value corresponds to.
    task INTEGER,
    task_output INTEGER,

    -- When provider_type='LITERAL': a JSON representation of the value
    literal_json TEXT,

    -- String representation of the data type
    typename TEXT NOT NULL,

    -- If the datum resides on disk: its path, relative to this db file
    path TEXT,

    -- Hash of provenance: summarizes all dependencies, independent of
    -- environment. It is computed _before_ the computation itself.
    hash_provenance TEXT,

    -- Hash of the content
    hash_content TEXT,

    -- Starts at 0 when the datum is provided or first computed. When the
    -- datum is passed as a non-const input to a non-const function, this
    -- version is recorded and incremented.
    current_version INTEGER NOT NULL,

    CONSTRAINT datum_task_fk
        FOREIGN KEY (task) REFERENCES task (id)
        ON DELETE RESTRICT,
    CONSTRAINT datum_task_output_fk
        FOREIGN KEY (task_output) REFERENCES func_output (id)
        ON DELETE RESTRICT
);
|
|
-- A task input records the version of a Datum that is passed to the function
|
|
CREATE TABLE task_input (
    id INTEGER PRIMARY KEY NOT NULL,

    -- Task that received the input
    task INTEGER NOT NULL,

    -- Declared function input that was filled
    func_input INTEGER NOT NULL,

    -- Datum that was passed in
    datum INTEGER NOT NULL,

    -- Version of that datum. Data have versions to facilitate tracking
    -- non-const operations. If a datum is passed to a non-const operation, a
    -- new datum is provided.
    datum_version INTEGER NOT NULL,

    CONSTRAINT task_input_task_fk
        FOREIGN KEY (task) REFERENCES task (id)
        ON DELETE RESTRICT,
    CONSTRAINT task_input_func_input_fk
        FOREIGN KEY (func_input) REFERENCES func_input (id)
        ON DELETE RESTRICT,
    CONSTRAINT task_input_datum_fk
        FOREIGN KEY (datum) REFERENCES datum (id)
        ON DELETE RESTRICT
);
|