diff --git a/src/nancy/schema.sql b/src/nancy/schema.sql index 51390cb..5d28841 100644 --- a/src/nancy/schema.sql +++ b/src/nancy/schema.sql @@ -2,56 +2,130 @@ PRAGMA user_version=0; -- version of this schema. -- ^^^^^ Increment the above whenever meaningful updates are made to this file. + PRAGMA foreign_keys=ON; +-- NOTE that ON UPDATE CASCADE is used for all foreign keys, in order to +-- facilitate easy importing of one database into another + + +-- We track host machines, so that we can give better information about file +-- locations. This also allows us to see whether platform-dependent behavior may +-- come into play. Note that these are not reliable identifiers for purposes of +-- tracking down data, but may be helpful context. +CREATE TABLE machine(id INTEGER PRIMARY KEY NOT NULL, + hostname TEXT, -- platform.node(): 'lucky' + processor TEXT, -- platform.processor(): + system TEXT, -- platform.system(): 'Linux' + release TEXT, -- platform.release(): '5.15.64' + machine TEXT, -- platform.machine(): 'x86_64' + timezone TEXT, -- timezone, for interpreting event times + freedesktop_os_release TEXT, -- requires python 3.10 + -- platform.freedesktop_os_release() as JSON (NOTE(review): the sample below is a Python dict repr with single quotes, not strict JSON; confirm the stored format) + -- "{'NAME': 'NixOS', 'ID': 'nixos', + -- 'PRETTY_NAME': 'NixOS 22.05 (Quokka)', 'BUG_REPORT_URL': + -- 'https://github.com/NixOS/nixpkgs/issues', 'BUILD_ID': + -- '22.05.20220902.67e4507', 'DOCUMENTATION_URL': + -- 'https://nixos.org/learn.html', 'HOME_URL': 'https://nixos.org/', 'LOGO': + -- 'nix-snowflake', 'SUPPORT_URL': 'https://nixos.org/community.html', + -- 'VERSION': '22.05 (Quokka)', 'VERSION_CODENAME': 'quokka', 'VERSION_ID': + -- '22.05'}" + win32_ver TEXT, -- platform.win32_ver() as JSON + mac_ver TEXT -- platform.mac_ver() as JSON +); +-- Programs are run by users on machines +CREATE TABLE user(id INTEGER PRIMARY KEY NOT NULL, + username TEXT NOT NULL, + userid INTEGER, + fullname TEXT, + machine INTEGER NOT NULL, + + UNIQUE(userid, machine), + FOREIGN KEY (machine) REFERENCES 
machine (id) ON UPDATE CASCADE +); + -- Stores, directories, and files -- These are the primary objects tracked by nancy. --- A store is a directory containing a file called nancy.db +-- A store is a directory containing a file called nancy.db (e.g. the dir holding this database) CREATE TABLE store (id INTEGER PRIMARY KEY NOT NULL, - name TEXT, - dbpath TEXT NOT NULL + machine INTEGER, + dbpath TEXT NOT NULL, + imported BOOL, -- false for the store of the current directory; true if the store was + -- imported in order to describe a dependency. + + FOREIGN KEY (machine) REFERENCES machine (id) ON UPDATE CASCADE ); -- Within the store are directories that contain files. +-- Note that there should be one entry with name='.' for the non-imported store. CREATE TABLE store_directory (id INTEGER PRIMARY KEY NOT NULL, + name TEXT, store INTEGER NOT NULL, - relpath TEXT, + parent INTEGER, -- parent directory (should be) in same store frozen BOOL NOT NULL, - FOREIGN KEY (store) REFERENCES store (id) + + UNIQUE(store, name, parent), /* NOTE(review): SQLite treats NULLs as distinct in UNIQUE constraints, so rows with a NULL parent are not deduplicated here */ + FOREIGN KEY (store) REFERENCES store (id) ON UPDATE CASCADE, + -- parent is a recursive key within this table. 
+ FOREIGN KEY (parent) REFERENCES store_directory (id) ON UPDATE CASCADE ); +CREATE TRIGGER insert_store_directory BEFORE INSERT ON store_directory /* abort an insert whose parent row belongs to a different store */ +BEGIN + SELECT + CASE + WHEN NEW.parent IS NOT NULL AND NEW.store != (SELECT store FROM store_directory WHERE id = NEW.parent) + THEN RAISE (ABORT, 'Parent directory resides in different store') + END; +END; +CREATE TRIGGER update_store_directory BEFORE UPDATE ON store_directory /* same check as the insert trigger, applied to the updated row */ +BEGIN + SELECT + CASE + WHEN NEW.parent IS NOT NULL AND NEW.store != (SELECT store FROM store_directory WHERE id = NEW.parent) + THEN RAISE (ABORT, 'Parent directory resides in different store') + END; +END; + CREATE TABLE store_file (id INTEGER PRIMARY KEY NOT NULL, - directory INTEGER NOT NULL, - relpath TEXT, -- path relative to top-level of _store_, not dir + directory INTEGER NOT NULL, -- reference a directory inside a store + filename TEXT, -- only a filename, not a path frozen BOOL NOT NULL, + unfrozen_perms TEXT, -- stat.filemode(os.stat(path).st_mode): '-rw-rw-r--' source_datum INTEGER, - FOREIGN KEY (directory) REFERENCES store_directory (id) - FOREIGN KEY (source_datum) REFERENCES datum (id) + UNIQUE(filename, directory), + FOREIGN KEY (directory) REFERENCES store_directory (id) ON UPDATE CASCADE, + FOREIGN KEY (source_datum) REFERENCES datum (id) ON UPDATE CASCADE ); --- computational environment which can execute "runs" +-- A computational environment which can execute "programs". Note that the +-- python executable being used, and environment variables are recorded here. +-- Other info is available in the parent "machine" table. 
CREATE TABLE environment (id INTEGER PRIMARY KEY NOT NULL, - hostname TEXT, -- platform.node(): computer's network name envvars_json TEXT, -- json.dumps(dict(os.environ)) python_implementation TEXT, -- platform.python_implementation(): 'cpython' python_strversion TEXT, -- sys.version: '3.9.7 (default, Sep 16 2021, 13:09:58) \n[GCC 7.5.0]' python_hexversion INTEGER, -- sys.hexversion: 50923504 - python_host_platform TEXT -- platform.platform(): Linux-5.15.0-46-generic-x86_64-with-glibc2.31 + user INTEGER, /* row in "user", which in turn references its "machine" */ + + FOREIGN KEY (user) REFERENCES user (id) ON UPDATE CASCADE ); --- A "process" is an execution of a single graph, starting from a number of roots --- It corresponds to a single host process. Note that if multiple calls to --- nancy.run() are provided within a single python script, multiple "processes" --- may be created. Also note that historical processes are automatically +-- A "program" is an execution of a single graph, starting from a number of roots +-- It corresponds to a single host parent process. Note that if multiple calls to +-- nancy.run() are provided within a single python script, multiple "programs" +-- may be created. Also note that historical programs are automatically -- imported and merged when possible when loading a "datum" from disk. -CREATE TABLE process (id INTEGER PRIMARY KEY NOT NULL, +CREATE TABLE program (id INTEGER PRIMARY KEY NOT NULL, + name TEXT, -- name of the program, usually written in code e.g. cnn_crossval -- we use POSIX timestamps in UTC for time recording. -- e.g. datetime.datetime.now().timestamp() start_time REAL, end_time REAL, process_id INTEGER, -- host PID of python process on host OS environment INTEGER NOT NULL, - FOREIGN KEY (environment) REFERENCES environment (id) + message TEXT, -- user-defined message to help distinguish similar runs + FOREIGN KEY (environment) REFERENCES environment (id) ON UPDATE CASCADE ); -- We try to track all python packages that impact execution by traversing a -- copy of sys.modules. 
This is done once before a "process" and once after in @@ -60,7 +134,7 @@ CREATE TABLE py_package (id INTEGER PRIMARY KEY NOT NULL, name TEXT NOT NULL, version TEXT, process INTEGER NOT NULL, - FOREIGN KEY (process) REFERENCES process (id) + FOREIGN KEY (process) REFERENCES program (id) ON UPDATE CASCADE /* the "process" table was renamed to "program"; the column keeps its name */ ); -- A py_module describes any python module file containing decorated Functions. -- Modules are tracked since they impact the global scope of function calls. @@ -68,7 +142,7 @@ CREATE TABLE py_module(id INTEGER PRIMARY KEY NOT NULL, name TEXT NOT NULL, code TEXT, package INTEGER, - FOREIGN KEY (package) REFERENCES py_package (id) + FOREIGN KEY (package) REFERENCES py_package (id) ON UPDATE CASCADE ); -- A "func" just describes a function, without reference to its arguments. It -- can have inputs and outputs, which are described in the func_inputs and @@ -80,7 +154,7 @@ CREATE TABLE func(id INTEGER PRIMARY KEY NOT NULL, -- e.g. datetime.datetime.now().timestamp() start_time REAL, end_time REAL, - FOREIGN KEY (module) REFERENCES py_module (id) + FOREIGN KEY (module) REFERENCES py_module (id) ON UPDATE CASCADE ); CREATE TABLE func_input(id INTEGER PRIMARY KEY NOT NULL, name TEXT NOT NULL, @@ -93,7 +167,7 @@ CREATE TABLE func_input(id INTEGER PRIMARY KEY NOT NULL, description TEXT, - FOREIGN KEY (func) REFERENCES func (id) + FOREIGN KEY (func) REFERENCES func (id) ON UPDATE CASCADE ); CREATE TABLE func_output(id INTEGER PRIMARY KEY NOT NULL, name TEXT, @@ -103,7 +177,7 @@ CREATE TABLE func_output(id INTEGER PRIMARY KEY NOT NULL, position INTEGER NOT NULL, description TEXT, - FOREIGN KEY (func) REFERENCES func (id) + FOREIGN KEY (func) REFERENCES func (id) ON UPDATE CASCADE ); @@ -114,8 +188,8 @@ CREATE TABLE func_output(id INTEGER PRIMARY KEY NOT NULL, CREATE TABLE task(id INTEGER PRIMARY KEY NOT NULL, func INTEGER NOT NULL, process INTEGER NOT NULL, - FOREIGN KEY (func) REFERENCES func (id), - FOREIGN KEY (process) REFERENCES process (id) + FOREIGN KEY (func) 
REFERENCES func (id) ON UPDATE CASCADE, + FOREIGN KEY (process) REFERENCES program (id) ON UPDATE CASCADE /* the "process" table was renamed to "program"; the column keeps its name */ ); -- A datum is an object that is computed as the output of a task, given as a -- literal value in a config file, or loaded from a file. @@ -141,15 +215,15 @@ CREATE TABLE datum(id INTEGER PRIMARY KEY NOT NULL, -- hash of provenance summarizes all dependencies, independent of environment -- computed _before_ computation hash_provenance TEXT, - hash_content TEXT, -- hash of content + hash_content TEXT, -- hash of content is only computed when this datum is saved to disk -- Version starts at 0 when provided/first computed. When passed as -- a non-const input to a non-const function, this version recorded and -- incremented current_version INTEGER NOT NULL, - FOREIGN KEY (task) REFERENCES task (id), - FOREIGN KEY (task_output) REFERENCES func_output (id) + FOREIGN KEY (task) REFERENCES task (id) ON UPDATE CASCADE, + FOREIGN KEY (task_output) REFERENCES func_output (id) ON UPDATE CASCADE ); -- A task input records the version of a Datum that is passed to the function CREATE TABLE task_input(id INTEGER PRIMARY KEY NOT NULL, @@ -161,7 +235,7 @@ CREATE TABLE task_input(id INTEGER PRIMARY KEY NOT NULL, -- is passed to a non-const operation, a new datum is provided datum_version INTEGER NOT NULL, - FOREIGN KEY (task) REFERENCES task (id), - FOREIGN KEY (func_input) REFERENCES func_input (id), - FOREIGN KEY (datum) REFERENCES datum (id) + FOREIGN KEY (task) REFERENCES task (id) ON UPDATE CASCADE, + FOREIGN KEY (func_input) REFERENCES func_input (id) ON UPDATE CASCADE, + FOREIGN KEY (datum) REFERENCES datum (id) ON UPDATE CASCADE ); diff --git a/tests/test_db.py b/tests/test_db.py index a470c91..4911e0d 100644 --- a/tests/test_db.py +++ b/tests/test_db.py @@ -1,8 +1,203 @@ -def test_schema_version_match(): - import nancy - from nancy.version import schema_version +""" +Pure SQL tests that don't depend on nancy's Python code +""" +import pytest - cur = 
nancy._conn.cursor() - (db_schema_ver,) = cur.execute("PRAGMA user_version;").fetchone() +import os +import sqlite3 + + +@pytest.fixture +def db(): + """Create an in-memory database that follows the nancy schema""" + conn = sqlite3.connect(':memory:') + cur = conn.cursor() + #import importlib + #schema = importlib.resources.open_text("nancy", "schema.sql").read() + schema_path = os.path.join( + os.path.dirname(__file__), '../src/nancy/schema.sql', + ) + with open(schema_path, 'r') as schema_file: # close the file handle once the script has been read + cur.executescript(schema_file.read()) + yield cur + conn.close() + + +@pytest.fixture(autouse=True) +def insert_machine(db): + db.executemany( + 'INSERT INTO machine VALUES ' + '(?, ?, ?, ?, ?, ?, ?, ?, ?, ?)', + [( + None, + 'lucky', #hostname TEXT, -- platform.node(): 'lucky' + '', #processor TEXT, -- platform.processor(): + 'Linux', #system TEXT, -- platform.system(): 'Linux' + '5.15.64', #release TEXT, -- platform.release(): '5.15.64' + 'aarch64', #machine TEXT, -- platform.machine(): 'x86_64' + 'EDT', #timezone TEXT, -- timezone, for interpreting event times + '', #freedesktop_os_release TEXT, -- requires python 3.10 + '', #win32_ver TEXT, -- platform.win32_ver() as JSON + '', #mac_ver TEXT -- platform.mac_ver() as JSON + ), ( + None, + 'a100', #hostname TEXT, -- platform.node(): 'lucky' + '', #processor TEXT, -- platform.processor(): + 'Linux', #system TEXT, -- platform.system(): 'Linux' + '5.15.63', #release TEXT, -- platform.release(): '5.15.64' + 'x86_64', #machine TEXT, -- platform.machine(): 'x86_64' + 'EST', #timezone TEXT, -- timezone, for interpreting event times + '', #freedesktop_os_release TEXT, -- requires python 3.10 + '', #win32_ver TEXT, -- platform.win32_ver() as JSON + '', #mac_ver TEXT -- platform.mac_ver() as JSON + )], + ) +def test_insert_machine(db): + db.execute('SELECT * FROM machine') + machines = db.fetchall() + assert len(machines) == 2 + +@pytest.fixture(autouse=True) +def insert_user(db, insert_machine): # user.machine references machine(id): request the parent fixture so ordering is explicit + db.executemany( + 'INSERT INTO user VALUES ' + '(?, ?, ?, ?, ?)', + [( + None, + 
'jacob', #username TEXT NOT NULL, + 101, #userid INTEGER, + 'Jacob Hinkle', #fullname TEXT, + 1, #machine INTEGER NOT NULL, + ), ( + None, + 'jacob', #username TEXT NOT NULL, + 10301, #userid INTEGER, + 'Jacob Hinkle', #fullname TEXT, + 2, #machine INTEGER NOT NULL, + ), ( + None, + 'bob', #username TEXT NOT NULL, + 2035, #userid INTEGER, + 'Just Bob', #fullname TEXT, + 2, #machine INTEGER NOT NULL, + )], + ) +def test_insert_user(db): + db.execute('SELECT * FROM user') + users = db.fetchall() + assert len(users) == 3 +def test_invalid_user_machine(db): + with pytest.raises(sqlite3.IntegrityError): + # should fail foreign key constraint + db.execute( + 'INSERT INTO user VALUES ' + '(?, ?, ?, ?, ?)', + ( + None, + 'bozo', #username TEXT NOT NULL, + 100, #userid INTEGER, + 'Bozo the Clown', #fullname TEXT, + 3, #machine INTEGER NOT NULL, + ), + ) + with pytest.raises(sqlite3.IntegrityError): + # should fail uniqueness constraint + db.execute( + 'INSERT INTO user VALUES ' + '(?, ?, ?, ?, ?)', + ( + None, + 'jacob', #username TEXT NOT NULL, + 101, #userid INTEGER, + 'Bozo the Clown', #fullname TEXT, + 1, #machine INTEGER NOT NULL, + ), + ) + + +@pytest.fixture(autouse=True) +def insert_store(db, insert_machine): # store.machine references machine(id) + db.executemany( + 'INSERT INTO store VALUES ' + '(?, ?, ?, ?)', + [( + None, + 1, #machine INTEGER, + '/path/to/first/store', #dbpath TEXT NOT NULL, + False, #imported BOOL, + ), ( + None, + 1, #machine INTEGER, + '/path/to/dependencys/store', #dbpath TEXT NOT NULL, + True, #imported BOOL, + ), ( + None, + 2, #machine INTEGER, + # same path but on a separate machine + '/path/to/first/store', #dbpath TEXT NOT NULL, + True, #imported BOOL, + )], + ) +@pytest.fixture(autouse=True) +def insert_store_directory(db, insert_store): # store_directory.store references store(id) + db.executemany( + 'INSERT INTO store_directory VALUES ' + '(?, ?, ?, ?, ?)', + [( + None, + '.', #name TEXT, + 1, #store INTEGER NOT NULL, + None, #parent INTEGER, -- parent directory (should be) in same store + False, #frozen BOOL NOT NULL, + ), ( + None, + 'foo', 
#name TEXT, + 1, #store INTEGER NOT NULL, + 1, #parent INTEGER, -- parent directory (should be) in same store + False, #frozen BOOL NOT NULL, + ), ( + None, + '.', #name TEXT, + 2, #store INTEGER NOT NULL, + None, #parent INTEGER, -- parent directory (should be) in same store + False, #frozen BOOL NOT NULL, + )], + ) + +def test_crossstore_directory_insert(db): + with pytest.raises(sqlite3.IntegrityError): + # declaring directory as belonging to store 2, but parent's store is 1 + db.execute( + 'INSERT INTO store_directory VALUES ' + '(?, ?, ?, ?, ?)', + ( + None, + '.', #name TEXT, + 2, #store INTEGER NOT NULL, + 1, #parent INTEGER, -- parent directory (should be) in same store + False, #frozen BOOL NOT NULL, + )) + + +@pytest.fixture(autouse=True) +def insert_store_file(db, insert_store_directory): # store_file.directory references store_directory(id) + db.executemany( + 'INSERT INTO store_file VALUES ' + '(?, ?, ?, ?, ?, ?)', + [( + None, + 1, #directory INTEGER NOT NULL + 'example.csv', #filename TEXT + 1, #frozen BOOL NOT NULL + '-rw-rw-r--', #unfrozen_perms TEXT + None, #source_datum INTEGER, + ), ( + None, + 2, #directory INTEGER NOT NULL + 'plots.png', #filename TEXT + 1, #frozen BOOL NOT NULL + '-rw-r--r--', #unfrozen_perms TEXT + None, #source_datum INTEGER, + )] + ) - assert schema_version == db_schema_ver diff --git a/tests/test_store.py b/tests/test_store.py new file mode 100644 index 0000000..e25c1ee --- /dev/null +++ b/tests/test_store.py @@ -0,0 +1,19 @@ +import pytest + +@pytest.fixture +def store(): + from nancy import store + + s = store.Store.init() + yield s + + + +def test_schema_version_match(store): + from nancy.version import schema_version + + cur = store.conn.cursor() + (db_schema_ver,) = cur.execute("PRAGMA user_version;").fetchone() + + assert schema_version == db_schema_ver +