Update schema and add tests which don't rely on python

This commit is contained in:
Jacob Hinkle 2022-09-20 12:07:32 -04:00
parent 9e7af6b8a1
commit 311990d5f6
3 changed files with 325 additions and 37 deletions

View File

@ -2,56 +2,130 @@
PRAGMA user_version=0; -- version of this schema. PRAGMA user_version=0; -- version of this schema.
-- ^^^^^ Increment the above whenever meaningful updates are made to this file. -- ^^^^^ Increment the above whenever meaningful updates are made to this file.
PRAGMA foreign_keys=ON; PRAGMA foreign_keys=ON;
-- NOTE that ON UPDATE CASCADE is used for all foreign keys, in order to
-- facilitate easy importing of one database into another
-- We track host machines, so that we can give better information about file
-- locations. This also allows us to see whether platform-dependent behavior may
-- come into play. Note that these are not reliable identifiers for purposes of
-- tracking down data, but may be helpful context.
CREATE TABLE machine (
    id                     INTEGER PRIMARY KEY NOT NULL,
    hostname               TEXT,  -- platform.node(), e.g. 'lucky'
    processor              TEXT,  -- platform.processor()
    system                 TEXT,  -- platform.system(), e.g. 'Linux'
    release                TEXT,  -- platform.release(), e.g. '5.15.64'
    machine                TEXT,  -- platform.machine(), e.g. 'x86_64'
    timezone               TEXT,  -- timezone, needed to interpret event times
    -- platform.freedesktop_os_release() serialized as JSON (the call requires
    -- python 3.10), e.g.:
    --   {"NAME": "NixOS", "ID": "nixos",
    --    "PRETTY_NAME": "NixOS 22.05 (Quokka)",
    --    "BUG_REPORT_URL": "https://github.com/NixOS/nixpkgs/issues",
    --    "BUILD_ID": "22.05.20220902.67e4507",
    --    "DOCUMENTATION_URL": "https://nixos.org/learn.html",
    --    "HOME_URL": "https://nixos.org/", "LOGO": "nix-snowflake",
    --    "SUPPORT_URL": "https://nixos.org/community.html",
    --    "VERSION": "22.05 (Quokka)", "VERSION_CODENAME": "quokka",
    --    "VERSION_ID": "22.05"}
    freedesktop_os_release TEXT,
    win32_ver              TEXT,  -- platform.win32_ver() as JSON
    mac_ver                TEXT   -- platform.mac_ver() as JSON
);
-- Programs are run by users on machines
CREATE TABLE user (
    id       INTEGER PRIMARY KEY NOT NULL,
    username TEXT NOT NULL,
    userid   INTEGER,           -- presumably the numeric OS user id (TODO confirm);
                                -- at most one row per (userid, machine) pair
    fullname TEXT,
    machine  INTEGER NOT NULL,  -- machine.id of the host this account belongs to
    UNIQUE (userid, machine),
    FOREIGN KEY (machine) REFERENCES machine (id) ON UPDATE CASCADE
);
-- Stores, directories, and files -- Stores, directories, and files
-- These are the primary objects tracked by nancy. -- These are the primary objects tracked by nancy.
-- A store is a directory containing a file called nancy.db -- A store is a directory containing a file called nancy.db (e.g. the dir holding this database)
CREATE TABLE store (id INTEGER PRIMARY KEY NOT NULL, CREATE TABLE store (id INTEGER PRIMARY KEY NOT NULL,
name TEXT, machine INTEGER,
dbpath TEXT NOT NULL dbpath TEXT NOT NULL,
imported BOOL, -- is this the store for the current directory? If not, then it was
-- imported in order to describe a dependency.
FOREIGN KEY (machine) REFERENCES machine (id) ON UPDATE CASCADE
); );
-- Within the store are directories that contain files. -- Within the store are directories that contain files.
-- Note that there should be one entry with name='.' (and no parent) for the non-imported store.
CREATE TABLE store_directory (id INTEGER PRIMARY KEY NOT NULL, CREATE TABLE store_directory (id INTEGER PRIMARY KEY NOT NULL,
name TEXT,
store INTEGER NOT NULL, store INTEGER NOT NULL,
relpath TEXT, parent INTEGER, -- parent directory (should be) in same store
frozen BOOL NOT NULL, frozen BOOL NOT NULL,
FOREIGN KEY (store) REFERENCES store (id)
UNIQUE(store, name, parent),
FOREIGN KEY (store) REFERENCES store (id) ON UPDATE CASCADE,
-- parent is a recursive key within this table.
FOREIGN KEY (parent) REFERENCES store_directory (id) ON UPDATE CASCADE
); );
-- Reject a new directory whose parent row belongs to a different store.
-- Rows without a parent are not checked.
CREATE TRIGGER insert_store_directory
BEFORE INSERT ON store_directory
WHEN NEW.parent IS NOT NULL
BEGIN
    -- The SELECT yields a row (and so evaluates RAISE) only when the stores differ.
    SELECT RAISE(ABORT, 'Parent directory resides in different store')
    WHERE NEW.store <> (
        SELECT parent_dir.store
        FROM store_directory AS parent_dir
        WHERE parent_dir.id = NEW.parent
    );
END;
-- Reject an update that would leave a directory pointing at a parent row in a
-- different store. Rows whose new parent is NULL are not checked.
CREATE TRIGGER update_store_directory
BEFORE UPDATE ON store_directory
WHEN NEW.parent IS NOT NULL
BEGIN
    -- The SELECT yields a row (and so evaluates RAISE) only when the stores differ.
    SELECT RAISE(ABORT, 'Parent directory resides in different store')
    WHERE NEW.store <> (
        SELECT parent_dir.store
        FROM store_directory AS parent_dir
        WHERE parent_dir.id = NEW.parent
    );
END;
CREATE TABLE store_file (id INTEGER PRIMARY KEY NOT NULL, CREATE TABLE store_file (id INTEGER PRIMARY KEY NOT NULL,
directory INTEGER NOT NULL, directory INTEGER NOT NULL, -- reference a directory inside a store
relpath TEXT, -- path relative to top-level of _store_, not dir filename TEXT, -- only a filename, not a path
frozen BOOL NOT NULL, frozen BOOL NOT NULL,
unfrozen_perms TEXT, -- stat.filemode(os.stat(path).st_mode): '-rw-rw-r--'
source_datum INTEGER, source_datum INTEGER,
FOREIGN KEY (directory) REFERENCES store_directory (id) UNIQUE(filename, directory),
FOREIGN KEY (source_datum) REFERENCES datum (id) FOREIGN KEY (directory) REFERENCES store_directory (id) ON UPDATE CASCADE,
FOREIGN KEY (source_datum) REFERENCES datum (id) ON UPDATE CASCADE
); );
-- computational environment which can execute "runs" -- A computational environment which can execute "processes". Note that the
-- python executable being used, and environment variables are recorded here.
-- Other info is available in the parent "machine" table.
CREATE TABLE environment (id INTEGER PRIMARY KEY NOT NULL, CREATE TABLE environment (id INTEGER PRIMARY KEY NOT NULL,
hostname TEXT, -- platform.node(): computer's network name
envvars_json TEXT, -- json.dumps(dict(os.environ)) envvars_json TEXT, -- json.dumps(dict(os.environ))
python_implementation TEXT, -- platform.python_implementation(): 'cpython' python_implementation TEXT, -- platform.python_implementation(): 'cpython'
python_strversion TEXT, -- sys.version: '3.9.7 (default, Sep 16 2021, 13:09:58) \n[GCC 7.5.0]' python_strversion TEXT, -- sys.version: '3.9.7 (default, Sep 16 2021, 13:09:58) \n[GCC 7.5.0]'
python_hexversion INTEGER, -- sys.hexversion: 50923504 python_hexversion INTEGER, -- sys.hexversion: 50923504
python_host_platform TEXT -- platform.platform(): Linux-5.15.0-46-generic-x86_64-with-glibc2.31 user INTEGER,
FOREIGN KEY (user) REFERENCES user (id) ON UPDATE CASCADE
); );
-- A "process" is an execution of a single graph, starting from a number of roots -- A "program" is an execution of a single graph, starting from a number of roots
-- It corresponds to a single host process. Note that if multiple calls to -- It corresponds to a single host parent process. Note that if multiple calls to
-- nancy.run() are provided within a single python script, multiple "processes" -- nancy.run() are provided within a single python script, multiple "programs"
-- may be created. Also note that historical processes are automatically -- may be created. Also note that historical programs are automatically
-- imported and merged when possible when loading a "datum" from disk. -- imported and merged when possible when loading a "datum" from disk.
CREATE TABLE process (id INTEGER PRIMARY KEY NOT NULL, CREATE TABLE program (id INTEGER PRIMARY KEY NOT NULL,
name TEXT, -- name of the program, usually written in code e.g. cnn_crossval
-- we use POSIX timestamps in UTC for time recording. -- we use POSIX timestamps in UTC for time recording.
-- e.g. datetime.datetime.now().timestamp() -- e.g. datetime.datetime.now().timestamp()
start_time REAL, start_time REAL,
end_time REAL, end_time REAL,
process_id INTEGER, -- host PID of python process on host OS process_id INTEGER, -- host PID of python process on host OS
environment INTEGER NOT NULL, environment INTEGER NOT NULL,
FOREIGN KEY (environment) REFERENCES environment (id) message TEXT, -- user-defined message to help distinguish similar runs
FOREIGN KEY (environment) REFERENCES environment (id) ON UPDATE CASCADE
); );
-- We try to track all python packages that impact execution by traversing a -- We try to track all python packages that impact execution by traversing a
-- copy of sys.modules. This is done once before a "process" and once after in -- copy of sys.modules. This is done once before a "process" and once after in
@ -60,7 +134,7 @@ CREATE TABLE py_package (id INTEGER PRIMARY KEY NOT NULL,
name TEXT NOT NULL, name TEXT NOT NULL,
version TEXT, version TEXT,
process INTEGER NOT NULL, process INTEGER NOT NULL,
FOREIGN KEY (process) REFERENCES process (id) FOREIGN KEY (process) REFERENCES process (id) ON UPDATE CASCADE
); );
-- A py_module describes any python module file containing decorated Functions. -- A py_module describes any python module file containing decorated Functions.
-- Modules are tracked since they impact the global scope of function calls. -- Modules are tracked since they impact the global scope of function calls.
@ -68,7 +142,7 @@ CREATE TABLE py_module(id INTEGER PRIMARY KEY NOT NULL,
name TEXT NOT NULL, name TEXT NOT NULL,
code TEXT, code TEXT,
package INTEGER, package INTEGER,
FOREIGN KEY (package) REFERENCES py_package (id) FOREIGN KEY (package) REFERENCES py_package (id) ON UPDATE CASCADE
); );
-- A "func" just describes a function, without reference to its arguments. It -- A "func" just describes a function, without reference to its arguments. It
-- can have inputs and outputs, which are described in the func_inputs and -- can have inputs and outputs, which are described in the func_inputs and
@ -80,7 +154,7 @@ CREATE TABLE func(id INTEGER PRIMARY KEY NOT NULL,
-- e.g. datetime.datetime.now().timestamp() -- e.g. datetime.datetime.now().timestamp()
start_time REAL, start_time REAL,
end_time REAL, end_time REAL,
FOREIGN KEY (module) REFERENCES py_module (id) FOREIGN KEY (module) REFERENCES py_module (id) ON UPDATE CASCADE
); );
CREATE TABLE func_input(id INTEGER PRIMARY KEY NOT NULL, CREATE TABLE func_input(id INTEGER PRIMARY KEY NOT NULL,
name TEXT NOT NULL, name TEXT NOT NULL,
@ -93,7 +167,7 @@ CREATE TABLE func_input(id INTEGER PRIMARY KEY NOT NULL,
description TEXT, description TEXT,
FOREIGN KEY (func) REFERENCES func (id) FOREIGN KEY (func) REFERENCES func (id) ON UPDATE CASCADE
); );
CREATE TABLE func_output(id INTEGER PRIMARY KEY NOT NULL, CREATE TABLE func_output(id INTEGER PRIMARY KEY NOT NULL,
name TEXT, name TEXT,
@ -103,7 +177,7 @@ CREATE TABLE func_output(id INTEGER PRIMARY KEY NOT NULL,
position INTEGER NOT NULL, position INTEGER NOT NULL,
description TEXT, description TEXT,
FOREIGN KEY (func) REFERENCES func (id) FOREIGN KEY (func) REFERENCES func (id) ON UPDATE CASCADE
); );
@ -114,8 +188,8 @@ CREATE TABLE func_output(id INTEGER PRIMARY KEY NOT NULL,
CREATE TABLE task(id INTEGER PRIMARY KEY NOT NULL, CREATE TABLE task(id INTEGER PRIMARY KEY NOT NULL,
func INTEGER NOT NULL, func INTEGER NOT NULL,
process INTEGER NOT NULL, process INTEGER NOT NULL,
FOREIGN KEY (func) REFERENCES func (id), FOREIGN KEY (func) REFERENCES func (id) ON UPDATE CASCADE,
FOREIGN KEY (process) REFERENCES process (id) FOREIGN KEY (process) REFERENCES process (id) ON UPDATE CASCADE
); );
-- A datum is an object that is computed as the output of a task, given as a -- A datum is an object that is computed as the output of a task, given as a
-- literal value in a config file, or loaded from a file. -- literal value in a config file, or loaded from a file.
@ -141,15 +215,15 @@ CREATE TABLE datum(id INTEGER PRIMARY KEY NOT NULL,
-- hash of provenance summarizes all dependencies, independent of environment -- hash of provenance summarizes all dependencies, independent of environment
-- computed _before_ computation -- computed _before_ computation
hash_provenance TEXT, hash_provenance TEXT,
hash_content TEXT, -- hash of content hash_content TEXT, -- hash of content is only computed when this datum is saved to disk
-- Version starts at 0 when provided/first computed. When passed as -- Version starts at 0 when provided/first computed. When passed as
-- a non-const input to a non-const function, this version recorded and -- a non-const input to a non-const function, this version recorded and
-- incremented -- incremented
current_version INTEGER NOT NULL, current_version INTEGER NOT NULL,
FOREIGN KEY (task) REFERENCES task (id), FOREIGN KEY (task) REFERENCES task (id) ON UPDATE CASCADE,
FOREIGN KEY (task_output) REFERENCES func_output (id) FOREIGN KEY (task_output) REFERENCES func_output (id) ON UPDATE CASCADE
); );
-- A task input records the version of a Datum that is passed to the function -- A task input records the version of a Datum that is passed to the function
CREATE TABLE task_input(id INTEGER PRIMARY KEY NOT NULL, CREATE TABLE task_input(id INTEGER PRIMARY KEY NOT NULL,
@ -161,7 +235,7 @@ CREATE TABLE task_input(id INTEGER PRIMARY KEY NOT NULL,
-- is passed to a non-const operation, a new datum is provided -- is passed to a non-const operation, a new datum is provided
datum_version INTEGER NOT NULL, datum_version INTEGER NOT NULL,
FOREIGN KEY (task) REFERENCES task (id), FOREIGN KEY (task) REFERENCES task (id) ON UPDATE CASCADE,
FOREIGN KEY (func_input) REFERENCES func_input (id), FOREIGN KEY (func_input) REFERENCES func_input (id) ON UPDATE CASCADE,
FOREIGN KEY (datum) REFERENCES datum (id) FOREIGN KEY (datum) REFERENCES datum (id) ON UPDATE CASCADE
); );

View File

@ -1,8 +1,203 @@
def test_schema_version_match(): """
import nancy Pure SQL tests that don't depend on nancy's Python code
from nancy.version import schema_version """
import pytest
cur = nancy._conn.cursor() import os
(db_schema_ver,) = cur.execute("PRAGMA user_version;").fetchone() import sqlite3
@pytest.fixture
def db():
    """Yield a cursor on a fresh in-memory database that follows the nancy schema.

    The schema is read from ``../src/nancy/schema.sql`` (relative to this test
    file) and executed as a script. The connection is closed when the fixture
    is torn down, including when loading the schema fails.
    """
    schema_path = os.path.join(
        os.path.dirname(__file__),
        '../src/nancy/schema.sql',
    )
    # Use a context manager so the file handle is closed right away instead of
    # being left open until garbage collection.
    with open(schema_path, 'r') as schema_file:
        schema = schema_file.read()

    conn = sqlite3.connect(':memory:')
    try:
        cur = conn.cursor()
        cur.executescript(schema)
        yield cur
    finally:
        conn.close()
@pytest.fixture(autouse=True)
def insert_machine(db):
    """Insert two rows into ``machine`` before every test.

    ``id`` is left out so SQLite assigns it; on the fresh database provided by
    ``db`` the rows get ids 1 and 2, which other fixtures and tests in this
    file reference.
    """
    # Columns are named explicitly so the insert does not depend on the
    # column order of the table definition.
    db.executemany(
        'INSERT INTO machine ('
        'hostname, processor, system, release, machine, timezone, '
        'freedesktop_os_release, win32_ver, mac_ver'
        ') VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)',
        [
            ('lucky', '', 'Linux', '5.15.64', 'aarch64', 'EDT', '', '', ''),
            ('a100', '', 'Linux', '5.15.63', 'x86_64', 'EST', '', '', ''),
        ],
    )
def test_insert_machine(db):
db.execute('SELECT * FROM machine')
machines = db.fetchall()
assert len(machines) == 2
@pytest.fixture(autouse=True)
def insert_user(db, insert_machine):
    """Insert three rows into ``user`` before every test.

    ``insert_machine`` is requested only for ordering: ``user.machine`` is a
    foreign key into ``machine``, so the machine rows must exist first.
    Without the explicit request the relative setup order of the autouse
    fixtures is left to pytest.
    """
    # Columns are named explicitly so the insert does not depend on the
    # column order of the table definition; ``id`` is assigned by SQLite.
    db.executemany(
        'INSERT INTO user (username, userid, fullname, machine) '
        'VALUES (?, ?, ?, ?)',
        [
            ('jacob', 101, 'Jacob Hinkle', 1),
            # same username on the other machine, with a different userid
            ('jacob', 10301, 'Jacob Hinkle', 2),
            ('bob', 2035, 'Just Bob', 2),
        ],
    )
def test_insert_user(db):
db.execute('SELECT * FROM user')
users = db.fetchall()
assert len(users) == 3
def test_invalid_user_machine(db):
    """A user insert must fail for an unknown machine or a duplicate (userid, machine).

    Both failures are raised as ``sqlite3.IntegrityError``, so the error text
    is matched as well; otherwise one constraint firing in place of the other
    would go unnoticed.
    """
    insert_user_sql = (
        'INSERT INTO user (username, userid, fullname, machine) '
        'VALUES (?, ?, ?, ?)'
    )

    # Machine 3 is not among the rows inserted by insert_machine, so the
    # foreign key on user.machine must reject this row.
    with pytest.raises(sqlite3.IntegrityError, match='FOREIGN KEY'):
        db.execute(insert_user_sql, ('bozo', 100, 'Bozo the Clown', 3))

    # userid 101 on machine 1 was already inserted by insert_user, so
    # UNIQUE(userid, machine) must reject this row.
    with pytest.raises(sqlite3.IntegrityError, match='UNIQUE'):
        db.execute(insert_user_sql, ('jacob', 101, 'Bozo the Clown', 1))
@pytest.fixture(autouse=True)
def insert_store(db, insert_machine):
    """Insert three rows into ``store`` before every test.

    ``insert_machine`` is requested only for ordering: ``store.machine`` is a
    foreign key into ``machine``, so the machine rows must exist first.
    """
    # Columns are named explicitly so the insert does not depend on the
    # column order of the table definition; ``id`` is assigned by SQLite.
    db.executemany(
        'INSERT INTO store (machine, dbpath, imported) VALUES (?, ?, ?)',
        [
            (1, '/path/to/first/store', False),
            (1, '/path/to/dependencys/store', True),
            # same path as the first store, but on a separate machine
            (2, '/path/to/first/store', True),
        ],
    )
@pytest.fixture(autouse=True)
def insert_store_directory(db, insert_store):
    """Insert three rows into ``store_directory`` before every test.

    ``insert_store`` is requested only for ordering: ``store_directory.store``
    is a foreign key into ``store``, so the store rows must exist first.
    """
    # Columns are named explicitly so the insert does not depend on the
    # column order of the table definition; ``id`` is assigned by SQLite.
    db.executemany(
        'INSERT INTO store_directory (name, store, parent, frozen) '
        'VALUES (?, ?, ?, ?)',
        [
            # top-level directory of store 1 (no parent)
            ('.', 1, None, False),
            # 'foo' inside the directory inserted just above (id 1), same store
            ('foo', 1, 1, False),
            # top-level directory of store 2 (no parent)
            ('.', 2, None, False),
        ],
    )
def test_crossstore_directory_insert(db):
    """A directory must not be inserted under a parent from a different store.

    The error text is matched so the test only passes when the schema's
    trigger message ('Parent directory resides in different store') is the
    reason for the failure, not some other integrity error.
    """
    # The new row claims store 2, but its parent (directory 1) is in store 1.
    with pytest.raises(sqlite3.IntegrityError, match='different store'):
        db.execute(
            'INSERT INTO store_directory (name, store, parent, frozen) '
            'VALUES (?, ?, ?, ?)',
            ('.', 2, 1, False),
        )
@pytest.fixture(autouse=True)
def insert_store_file(db, insert_store_directory):
    """Insert two rows into ``store_file`` before every test.

    ``insert_store_directory`` is requested only for ordering:
    ``store_file.directory`` is a foreign key into ``store_directory``, so the
    directory rows must exist first.
    """
    # Columns are named explicitly so the insert does not depend on the
    # column order of the table definition; ``id`` is assigned by SQLite.
    # ``frozen`` is given as a bool like in the other fixtures (stored as 1).
    db.executemany(
        'INSERT INTO store_file '
        '(directory, filename, frozen, unfrozen_perms, source_datum) '
        'VALUES (?, ?, ?, ?, ?)',
        [
            (1, 'example.csv', True, '-rw-rw-r--', None),
            (2, 'plots.png', True, '-rw-r--r--', None),
        ],
    )
assert schema_version == db_schema_ver

19
tests/test_store.py Normal file
View File

@ -0,0 +1,19 @@
import pytest
@pytest.fixture
def store():
    """Yield the object returned by ``nancy.store.Store.init()``."""
    # Imported inside the fixture, under an alias so the module is not
    # confused with this fixture's own name.
    from nancy import store as store_module

    yield store_module.Store.init()
def test_schema_version_match(store):
    """``PRAGMA user_version`` of the store's database must equal ``nancy.version.schema_version``."""
    from nancy.version import schema_version

    version_row = store.conn.cursor().execute("PRAGMA user_version;").fetchone()
    # The pragma yields a single one-column row; unpacking enforces that shape.
    (db_schema_ver,) = version_row
    assert schema_version == db_schema_ver