nancy/dprov/store/schema.sql

132 lines
5.6 KiB
SQL

-- Schema definition for dprov.db.
--
-- user_version holds the version of this schema. Bump the number below
-- whenever a meaningful change is made to this file.
PRAGMA user_version = 0;
-- Enforce the FOREIGN KEY clauses declared below; SQLite only checks them on
-- connections where this pragma has been switched on.
PRAGMA foreign_keys = ON;
-- environment: a computational environment which can execute "runs".
-- The trailing comment on each column shows the Python expression the value
-- is taken from, followed by a sample value where one is available.
CREATE TABLE environment (
    id                    INTEGER PRIMARY KEY NOT NULL,
    hostname              TEXT,     -- platform.node(): 'lucky'
    envvars_json          TEXT,     -- json.dumps(dict(os.environ))
    python_implementation TEXT,     -- platform.python_implementation(): 'cpython'
    python_strversion     TEXT,     -- sys.version: '3.9.7 (default, Sep 16 2021, 13:09:58) \n[GCC 7.5.0]'
    python_hexversion     INTEGER,  -- sys.hexversion: 50923504
    python_host_platform  TEXT      -- platform.platform(): Linux-5.15.0-46-generic-x86_64-with-glibc2.31
);
-- process: one execution of a single graph, starting from a number of roots.
-- Each row corresponds to a single host process. A python script that calls
-- dprov.run() more than once may therefore create several "processes".
-- Historical processes are also imported and merged automatically, when
-- possible, while "data" is being loaded from disk.
CREATE TABLE process (
    id          INTEGER PRIMARY KEY NOT NULL,
    -- Times are recorded as POSIX timestamps in UTC,
    -- e.g. datetime.datetime.now().timestamp()
    start_time  REAL,
    end_time    REAL,
    process_id  INTEGER,           -- host PID of the python process on the host OS
    environment INTEGER NOT NULL,  -- id of the associated environment row
    CONSTRAINT process_environment_fk
        FOREIGN KEY (environment) REFERENCES environment (id) ON DELETE RESTRICT
);
-- py_package: a python package that may impact execution. Packages are found
-- by traversing a copy of sys.modules. The traversal is done once before a
-- "process" and once after it, in case some calling code winds up calling a
-- previously-unloaded module.
CREATE TABLE py_package (
    id      INTEGER PRIMARY KEY NOT NULL,
    name    TEXT NOT NULL,
    version TEXT,
    process INTEGER NOT NULL,  -- id of the associated process row
    CONSTRAINT py_package_process_fk
        FOREIGN KEY (process) REFERENCES process (id) ON DELETE RESTRICT
);
-- py_module: any python module file that contains decorated Functions.
-- Modules are tracked because they impact the global scope of function calls.
CREATE TABLE py_module (
    id      INTEGER PRIMARY KEY NOT NULL,
    name    TEXT NOT NULL,
    code    TEXT,
    package INTEGER,  -- id of the associated py_package row; may be NULL
    CONSTRAINT py_module_package_fk
        FOREIGN KEY (package) REFERENCES py_package (id) ON DELETE RESTRICT
);
-- func: describes a function on its own, without reference to its arguments.
-- A func can have inputs and outputs; those are described in the func_input
-- and func_output child tables.
CREATE TABLE func (
    id         INTEGER PRIMARY KEY NOT NULL,
    name       TEXT NOT NULL,
    module     INTEGER NOT NULL,  -- id of the associated py_module row
    -- Times are recorded as POSIX timestamps in UTC,
    -- e.g. datetime.datetime.now().timestamp()
    -- NOTE(review): it is not stated what interval these two columns measure
    -- for a function description -- confirm against the writer code.
    start_time REAL,
    end_time   REAL,
    CONSTRAINT func_module_fk
        FOREIGN KEY (module) REFERENCES py_module (id) ON DELETE RESTRICT
);
-- func_input: a named, typed input belonging to a func.
CREATE TABLE func_input (
    id       INTEGER PRIMARY KEY NOT NULL,
    name     TEXT NOT NULL,
    typename TEXT NOT NULL,
    func     INTEGER NOT NULL,  -- id of the func row this input belongs to
    CONSTRAINT func_input_func_fk
        FOREIGN KEY (func) REFERENCES func (id) ON DELETE RESTRICT
);
-- func_output: a typed output belonging to a func. The name is optional;
-- the position is always required.
CREATE TABLE func_output (
    id       INTEGER PRIMARY KEY NOT NULL,
    name     TEXT,
    -- presumably the index of this output among the func's outputs -- confirm
    position INTEGER NOT NULL,
    typename TEXT NOT NULL,
    func     INTEGER NOT NULL,  -- id of the func row this output belongs to
    CONSTRAINT func_output_func_fk
        FOREIGN KEY (func) REFERENCES func (id) ON DELETE RESTRICT
);
-- task: an executed call to a Function. A task corresponds to one particular
-- set of inputs, which are themselves data (see the datum table). Every task
-- is executed in the context of a "process"; within a process, tasks are
-- typically evaluated in a serial manner.
CREATE TABLE task (
    id      INTEGER PRIMARY KEY NOT NULL,
    func    INTEGER NOT NULL,  -- id of the func row that was called
    process INTEGER NOT NULL,  -- id of the process row providing the context
    CONSTRAINT task_func_fk
        FOREIGN KEY (func) REFERENCES func (id) ON DELETE RESTRICT,
    CONSTRAINT task_process_fk
        FOREIGN KEY (process) REFERENCES process (id) ON DELETE RESTRICT
);
-- datum: an object that is either computed as the output of a task, given as
-- a literal value in a config file, or loaded from a file.
CREATE TABLE datum (
    id              INTEGER PRIMARY KEY NOT NULL,
    -- provider_type describes where the data came from. Choices are:
    --   COMPUTED: output of a decorated Function
    --   IMPORTED: imported output from a prior process
    --   LITERAL:  literal value provided in a config file
    -- Note that literals are implicitly imported (hash computed) at runtime.
    provider_type   TEXT,
    -- task and task_output are only used when provider_type='COMPUTED'.
    task            INTEGER,
    task_output     INTEGER,
    -- If provider_type='LITERAL', a JSON representation of the value.
    literal_json    TEXT,
    typename        TEXT NOT NULL,  -- string representation of the data type
    path            TEXT,           -- if residing on disk, the path relative to this db file
    -- The hash of provenance summarizes all dependencies, independent of
    -- environment. It is computed _before_ computation.
    hash_provenance TEXT,
    hash_content    TEXT,           -- hash of content
    -- The version starts at 0 when the datum is provided/first computed. When
    -- the datum is passed as a non-const input to a non-const function, this
    -- version is recorded and then incremented.
    current_version INTEGER NOT NULL,
    CONSTRAINT datum_task_fk
        FOREIGN KEY (task) REFERENCES task (id) ON DELETE RESTRICT,
    CONSTRAINT datum_task_output_fk
        FOREIGN KEY (task_output) REFERENCES func_output (id) ON DELETE RESTRICT
);
-- task_input: records the version of a Datum that is passed to the function.
CREATE TABLE task_input (
    id            INTEGER PRIMARY KEY NOT NULL,
    task          INTEGER NOT NULL,  -- id of the task row receiving the input
    func_input    INTEGER NOT NULL,  -- id of the func_input row being filled
    datum         INTEGER NOT NULL,  -- id of the datum row that was passed
    -- Data have versions to facilitate tracking non-const operations. If a
    -- datum is passed to a non-const operation, a new datum is provided.
    datum_version INTEGER NOT NULL,
    CONSTRAINT task_input_task_fk
        FOREIGN KEY (task) REFERENCES task (id) ON DELETE RESTRICT,
    CONSTRAINT task_input_func_input_fk
        FOREIGN KEY (func_input) REFERENCES func_input (id) ON DELETE RESTRICT,
    CONSTRAINT task_input_datum_fk
        FOREIGN KEY (datum) REFERENCES datum (id) ON DELETE RESTRICT
);