From fb6a54c6ef0fb5738a6619ba3d33ee4bf7d519a2 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Thu, 22 Sep 2022 15:47:49 -0400 Subject: [PATCH] Clean up schema and add filedir_versions table --- src/nancy/schema/version0.sql | 179 ++++++++++++++++++++++------------ tests/test_db.py | 73 +++++--------- 2 files changed, 143 insertions(+), 109 deletions(-) diff --git a/src/nancy/schema/version0.sql b/src/nancy/schema/version0.sql index 9cc25ee..9bfe3d1 100644 --- a/src/nancy/schema/version0.sql +++ b/src/nancy/schema/version0.sql @@ -8,6 +8,21 @@ PRAGMA foreign_keys=ON; -- facilitate easy importing of one database into another +-- This lets us enable or disable triggers. For example, when importing we need +-- to update some tables to account for changing primary keys using ON UPDATE +-- CASCADE. However, some dependent tables like filedir may have a trigger that +-- prevents all updates: e.g. for deleted files. In this case, we need to be +-- able to disable that one specific trigger during imports, then re-enable it. +-- We would do so by updating this table and setting enabled=False/True. +-- +-- Note that entries in this table only need to be inserted when we know we have +-- a need to disable the trigger. Otherwise, this table can be ignored. +CREATE TABLE triggers( + name TEXT PRIMARY KEY, + enabled BOOL +); + + -- We track host machines, so that we can give better information about file -- locations. This also allows us to see whether platform-dependent behavior may -- come into play. Note that these are not reliable identifiers for purposes of @@ -79,42 +94,90 @@ CREATE TABLE filedir (id INTEGER PRIMARY KEY NOT NULL, filename TEXT, -- only a filename, not a path parent INTEGER REFERENCES filedir ON UPDATE CASCADE, - frozen BOOL NOT NULL, filetype TEXT, -- One of 'LNK', 'DIR', 'REG', etc. 
See store.FSEntry.from_path for details - unfrozen_perms TEXT, -- stat.filemode(os.stat(path).st_mode): '-rw-rw-r--' - symlink_target TEXT, -- if this is a symlink, this is the (read but not fully resolved) target + -- Note that changes in filetype are seen as a delete/replace instead of an + -- update. If such a change is noticed, the original filedir entry must be + -- marked deleted and a new one created with the new filetype. - sha256 TEXT NOT NULL, - - source_program INTEGER, -- Note that this is redundant since datum points to a program... - source_datum INTEGER, + frozen BOOL NOT NULL, + deleted BOOL NOT NULL, -- set True when recording a deleted file + -- NOTE: a deleted file should not be updated, other than through a cascade UNIQUE(store, filename, parent), - - FOREIGN KEY (store) REFERENCES store (id) ON UPDATE CASCADE, - FOREIGN KEY (source_datum) REFERENCES datum (id) ON UPDATE CASCADE, - FOREIGN KEY (source_program) REFERENCES program (id) ON UPDATE CASCADE + FOREIGN KEY (store) REFERENCES store (id) ON UPDATE CASCADE ); CREATE TRIGGER insert_filedir BEFORE INSERT ON filedir -BEGIN - SELECT - CASE - WHEN NEW.parent IS NOT NULL AND (SELECT filetype FROM filedir WHERE id = NEW.parent) != 'DIR' +BEGIN SELECT CASE + WHEN NEW.parent IS NOT NULL AND (SELECT filetype FROM filedir WHERE id = NEW.parent) != 'DIR' THEN RAISE (ABORT, 'Parent is not listed as a directory') - WHEN NEW.parent IS NOT NULL AND NEW.store != (SELECT store FROM filedir WHERE id = NEW.parent) + WHEN NEW.parent IS NOT NULL AND NEW.store != (SELECT store FROM filedir WHERE id = NEW.parent) THEN RAISE (ABORT, 'Parent directory resides in different store') - END; -END; +END; END; CREATE TRIGGER update_filedir BEFORE UPDATE ON filedir -BEGIN - SELECT - CASE - WHEN NEW.parent IS NOT NULL AND (SELECT filetype FROM filedir WHERE id = NEW.parent) != 'DIR' +BEGIN SELECT CASE + WHEN NEW.parent IS NOT NULL AND (SELECT filetype FROM filedir WHERE id = NEW.parent) != 'DIR' THEN RAISE (ABORT, 'Parent 
is not listed as a directory') - WHEN NEW.parent IS NOT NULL AND NEW.store != (SELECT store FROM filedir WHERE id = NEW.parent) + WHEN NEW.parent IS NOT NULL AND NEW.store != (SELECT store FROM filedir WHERE id = NEW.parent) THEN RAISE (ABORT, 'Parent directory resides in different store') - END; -END; +END; END; +-- Files that are recorded to be deleted have the deleted column set to True. +-- These files should no longer be used. In particular, we should never rename +-- or update them since their lifetime is over. +INSERT INTO triggers VALUES('update_deleted_filedir', TRUE); +CREATE TRIGGER update_deleted_filedir BEFORE UPDATE ON filedir +BEGIN SELECT CASE + WHEN OLD.deleted AND (SELECT enabled FROM triggers WHERE name = 'update_deleted_filedir') + THEN RAISE (ABORT, 'Cannot modify filedir entry for deleted file') +END; END; + + +-- This table holds _versions_ of files. This table holds information that is +-- independent of the location within the filesystem and merely indicates a +-- version of the content in a format that is nearly independent of filetype +-- (though the computation of content hashes is of course dependent on +-- filetype). Each version has a number, and was provided by some program (and +-- potentially a datum). +CREATE TABLE filedir_version (id INTEGER PRIMARY KEY NOT NULL, + filedir INTEGER REFERENCES filedir ON UPDATE CASCADE, -- parent filedir entry + version INTEGER NOT NULL, -- incremented whenever a new version of the file is recorded + + -- We record the permissions on each file, in a way that enables reloading + -- permissions properly when thawing after a freeze operation. + unfrozen_perms TEXT, -- stat.filemode(os.stat(path).st_mode): '-rw-rw-r--' + + symlink_target TEXT, -- if this is a symlink, this is the (read but not fully resolved) target. I.e. this is the "content" of the symlink. + + -- The following hash can be NULL if the file was deleted. It could also be + -- null if hashing is deferred. 
Hash deferring would be preferred in cases + -- where a program must use a file or directory for multiple tasks, and each + -- task might mutate the directory. In these cases, each task will add an + -- additional version, but we do not want to hash every version since that + -- would needlessly slow down execution, for very little benefit since we + -- would be recording hashes for versions that may not be final. Instead, we + -- can defer by only hashing files and directories at the end of a program. + sha256 TEXT, + + source_task INTEGER, + + UNIQUE(filedir, version), + FOREIGN KEY (source_task) REFERENCES task (id) ON UPDATE CASCADE +); +-- Disallow inserting or updating versions for deleted files. (the version +-- deleting the file should be added first, then the deleted field set on the +-- filedir row). The update trigger is toggleable to enable importing. +CREATE TRIGGER insert_filedir_version_parent_deleted BEFORE INSERT ON filedir_version +BEGIN SELECT CASE + WHEN (SELECT deleted FROM filedir WHERE id = NEW.filedir) + THEN RAISE (ABORT, 'Attempted to insert filedir_version whose filedir was deleted') +END; END; +INSERT INTO triggers VALUES('update_filedir_version_parent_deleted', TRUE); +CREATE TRIGGER update_filedir_version_parent_deleted BEFORE UPDATE ON filedir_version +BEGIN SELECT CASE + WHEN (SELECT deleted FROM filedir WHERE id = NEW.filedir) + AND (SELECT enabled FROM triggers WHERE name = 'update_filedir_version_parent_deleted') + THEN RAISE (ABORT, 'Attempted to update filedir_version whose parent was deleted') +END; END; + -- A computational environment which can execute "programs". Note that the -- python executable being used, and environment variables are recorded here. @@ -160,34 +223,35 @@ CREATE TABLE program (id INTEGER PRIMARY KEY NOT NULL, -- case some calling code winds up calling a previously-unloaded module. 
CREATE TABLE py_package (id INTEGER PRIMARY KEY NOT NULL, name TEXT NOT NULL, - version TEXT, - program INTEGER NOT NULL, - FOREIGN KEY (program) REFERENCES program (id) ON UPDATE CASCADE + version TEXT ); -- A py_module describes any python module file containing decorated Functions. -- Modules are tracked since they impact the global scope of function calls. CREATE TABLE py_module(id INTEGER PRIMARY KEY NOT NULL, name TEXT NOT NULL, code TEXT, - package INTEGER, - FOREIGN KEY (package) REFERENCES py_package (id) ON UPDATE CASCADE + py_package INTEGER, + + FOREIGN KEY (py_package) REFERENCES py_package (id) ON UPDATE CASCADE ); --- A "func" just describes a function, without reference to its arguments. It --- can have inputs and outputs, which are described in the func_inputs and --- func_outputs children tables. +-- A py_function just describes a function, without reference to its arguments. +-- It can have inputs and outputs, which are described in the +-- py_function_input and py_function_output children tables. -CREATE TABLE func(id INTEGER PRIMARY KEY NOT NULL, +CREATE TABLE py_function(id INTEGER PRIMARY KEY NOT NULL, name TEXT NOT NULL, - module INTEGER NOT NULL, + py_module INTEGER NOT NULL, + -- we use POSIX timestamps for time recording. -- e.g. 
datetime.datetime.now().timestamp() start_time REAL, end_time REAL, - FOREIGN KEY (module) REFERENCES py_module (id) ON UPDATE CASCADE + + FOREIGN KEY (py_module) REFERENCES py_module (id) ON UPDATE CASCADE ); -CREATE TABLE func_input(id INTEGER PRIMARY KEY NOT NULL, +CREATE TABLE py_function_input(id INTEGER PRIMARY KEY NOT NULL, name TEXT NOT NULL, typename TEXT NOT NULL, - func INTEGER NOT NULL, + py_function INTEGER NOT NULL, position INTEGER, posonly BOOL, @@ -195,28 +259,30 @@ CREATE TABLE func_input(id INTEGER PRIMARY KEY NOT NULL, description TEXT, - FOREIGN KEY (func) REFERENCES func (id) ON UPDATE CASCADE + FOREIGN KEY (py_function) REFERENCES py_function (id) ON UPDATE CASCADE ); -CREATE TABLE func_output(id INTEGER PRIMARY KEY NOT NULL, +CREATE TABLE py_function_output(id INTEGER PRIMARY KEY NOT NULL, name TEXT, typename TEXT NOT NULL, - func INTEGER NOT NULL, + py_function INTEGER NOT NULL, position INTEGER NOT NULL, description TEXT, - FOREIGN KEY (func) REFERENCES func (id) ON UPDATE CASCADE + FOREIGN KEY (py_function) REFERENCES py_function (id) ON UPDATE CASCADE ); --- Tasks are executed calls to Functions: they correspond to a particular set of --- inputs which are themselves data (see datum table). A task is executed in --- the context of a "program". Within a program, tasks are typically evaluated --- in a serial manner. +-- Tasks are usually executed calls to Functions: they correspond to a +-- particular set of inputs which are themselves data (see datum table). A task +-- is executed in the context of a "program". Within a program, tasks are +-- typically evaluated in a serial manner. 
CREATE TABLE task(id INTEGER PRIMARY KEY NOT NULL, - func INTEGER NOT NULL, program INTEGER NOT NULL, - FOREIGN KEY (func) REFERENCES func (id) ON UPDATE CASCADE, + + py_function INTEGER, -- py_function is NULL for some built-in functionality like "record" programs + + FOREIGN KEY (py_function) REFERENCES py_function (id) ON UPDATE CASCADE, FOREIGN KEY (program) REFERENCES program (id) ON UPDATE CASCADE ); -- A datum is an object that is computed as the output of a task, given as a @@ -237,26 +303,13 @@ CREATE TABLE datum(id INTEGER PRIMARY KEY NOT NULL, typename TEXT NOT NULL, -- string representation of the data type - saved_path TEXT, -- if residing on disk, the path relative to this db file - saved_version TEXT, -- if residing on disk, the path relative to this db file - - -- hash of provenance summarizes all dependencies, independent of environment - -- computed _before_ computation - hash_provenance TEXT, - hash_content TEXT, -- hash of content is only computed when this datum is saved to disk - - -- Version starts at 0 when provided/first computed. When passed as - -- a non-const input to a non-const function, this version recorded and - -- incremented - current_version INTEGER NOT NULL, - FOREIGN KEY (task) REFERENCES task (id) ON UPDATE CASCADE, - FOREIGN KEY (task_output) REFERENCES func_output (id) ON UPDATE CASCADE + FOREIGN KEY (task_output) REFERENCES py_function_output (id) ON UPDATE CASCADE ); --- A task input records the version of a Datum that is passed to the function +-- A task_input records the version of a Datum that is passed to a function CREATE TABLE task_input(id INTEGER PRIMARY KEY NOT NULL, task INTEGER NOT NULL, - func_input INTEGER NOT NULL, + py_function_input INTEGER, -- if this was a python function, reference which input datum INTEGER NOT NULL, -- Data have versions to facilitate tracking non-const operations. 
If a datum @@ -265,6 +318,6 @@ CREATE TABLE task_input(id INTEGER PRIMARY KEY NOT NULL, datum_version INTEGER NOT NULL, FOREIGN KEY (task) REFERENCES task (id) ON UPDATE CASCADE, - FOREIGN KEY (func_input) REFERENCES func_input (id) ON UPDATE CASCADE, + FOREIGN KEY (py_function_input) REFERENCES py_function_input (id) ON UPDATE CASCADE, FOREIGN KEY (datum) REFERENCES datum (id) ON UPDATE CASCADE ); diff --git a/tests/test_db.py b/tests/test_db.py index fcfe07d..589e8e3 100644 --- a/tests/test_db.py +++ b/tests/test_db.py @@ -152,43 +152,31 @@ def insert_directories(insert_store): cur = insert_store cur.executemany( 'INSERT INTO filedir VALUES ' - '(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)', + '(?, ?, ?, ?, ?, ?, ?)', [( None, #id INTEGER PRIMARY KEY NOT NULL, 1, #store INTEGER NOT NULL, '.', #filename TEXT, -- only a filename, not a path None, #parent INTEGER REFERENCES filedir ON UPDATE CASCADE, - False, #frozen BOOL NOT NULL, 'DIR', #filetype TEXT, -- One of 'LNK', 'DIR', 'REG', etc. See store.FSEntry.from_path for details - 'dr-xr-xr--', #unfrozen_perms TEXT, -- stat.filemode(os.stat(path).st_mode): '-rw-rw-r--' - None, #symlink_target TEXT, -- if this is a symlink, this is the (read but not fully resolved) target - 'e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855', #sha256 TEXT NOT NULL, - None, #source_program INTEGER, -- Note that this is redundant since datum points to a program... - None, #source_datum INTEGER, + False, #frozen BOOL NOT NULL, + False, #deleted BOOL NOT NULL, ), ( None, #id INTEGER PRIMARY KEY NOT NULL, 1, #store INTEGER NOT NULL, 'foo', #filename TEXT, -- only a filename, not a path 1, #parent INTEGER REFERENCES filedir ON UPDATE CASCADE, - False, #frozen BOOL NOT NULL, 'DIR', #filetype TEXT, -- One of 'LNK', 'DIR', 'REG', etc. 
See store.FSEntry.from_path for details - 'dr-xr-xr--', #unfrozen_perms TEXT, -- stat.filemode(os.stat(path).st_mode): '-rw-rw-r--' - None, #symlink_target TEXT, -- if this is a symlink, this is the (read but not fully resolved) target - '5ad4e9e3090de8de781e4a35ce6ba16ad8eaba7a5456cd230a36f77143885396', #sha256 TEXT NOT NULL, - None, #source_program INTEGER, -- Note that this is redundant since datum points to a program... - None, #source_datum INTEGER, + False, #frozen BOOL NOT NULL, + False, #deleted BOOL NOT NULL, ), ( None, #id INTEGER PRIMARY KEY NOT NULL, 2, #store INTEGER NOT NULL, '.', #filename TEXT, -- only a filename, not a path None, #parent INTEGER REFERENCES filedir ON UPDATE CASCADE, - False, #frozen BOOL NOT NULL, 'DIR', #filetype TEXT, -- One of 'LNK', 'DIR', 'REG', etc. See store.FSEntry.from_path for details - 'dr-xr-xr--', #unfrozen_perms TEXT, -- stat.filemode(os.stat(path).st_mode): '-rw-rw-r--' - None, #symlink_target TEXT, -- if this is a symlink, this is the (read but not fully resolved) target - '15c5e8d80a48803c18e72cd274532d608b8026dcbc192afc490fe1c289ec6ff1', #sha256 TEXT NOT NULL, - None, #source_program INTEGER, -- Note that this is redundant since datum points to a program... - None, #source_datum INTEGER, + False, #frozen BOOL NOT NULL, + False, #deleted BOOL NOT NULL, )], ) return cur @@ -199,19 +187,15 @@ def test_crossstore_directory_insert(insert_directories): # declaring directory as belonging to store 2, but parent's store is 1 cur.execute( 'INSERT INTO filedir VALUES ' - '(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)', + '(?, ?, ?, ?, ?, ?, ?)', ( None, #id INTEGER PRIMARY KEY NOT NULL, 2, #store INTEGER NOT NULL, 'some_dir', #filename TEXT, -- only a filename, not a path 1, #parent INTEGER REFERENCES filedir ON UPDATE CASCADE, - False, #frozen BOOL NOT NULL, 'DIR', #filetype TEXT, -- One of 'LNK', 'DIR', 'REG', etc. 
See store.FSEntry.from_path for details - 'dr-xr-xr--', #unfrozen_perms TEXT, -- stat.filemode(os.stat(path).st_mode): '-rw-rw-r--' - None, #symlink_target TEXT, -- if this is a symlink, this is the (read but not fully resolved) target - '15c5e8d80a48803c18e72cd274532d608b8026dcbc192afc490fe1c289ec6ff1', #sha256 TEXT NOT NULL, - None, #source_program INTEGER, -- Note that this is redundant since datum points to a program... - None, #source_datum INTEGER, + False, #frozen BOOL NOT NULL, + False, #deleted BOOL NOT NULL, )) for row in cur.connection.iterdump(): print(row) @@ -224,31 +208,23 @@ def insert_files(insert_directories): cur = insert_directories cur.executemany( 'INSERT INTO filedir VALUES ' - '(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)', + '(?, ?, ?, ?, ?, ?, ?)', [( None, #id INTEGER PRIMARY KEY NOT NULL, 1, #store INTEGER NOT NULL, 'example.csv', #filename TEXT, -- only a filename, not a path 1, #parent INTEGER REFERENCES filedir ON UPDATE CASCADE, - False, #frozen BOOL NOT NULL, 'REG', #filetype TEXT, -- One of 'LNK', 'DIR', 'REG', etc. See store.FSEntry.from_path for details - 'drw-rw-r--', #unfrozen_perms TEXT, -- stat.filemode(os.stat(path).st_mode): '-rw-rw-r--' - None, #symlink_target TEXT, -- if this is a symlink, this is the (read but not fully resolved) target - '9aafde8f9dbec34c694b86333f746f58958c44247c474904e06d1f07f94292b4', #sha256 TEXT NOT NULL, - None, #source_program INTEGER, -- Note that this is redundant since datum points to a program... - None, #source_datum INTEGER, + False, #frozen BOOL NOT NULL, + False, #deleted BOOL NOT NULL, ), ( None, #id INTEGER PRIMARY KEY NOT NULL, 1, #store INTEGER NOT NULL, 'plots.png', #filename TEXT, -- only a filename, not a path 2, #parent INTEGER REFERENCES filedir ON UPDATE CASCADE, - False, #frozen BOOL NOT NULL, 'REG', #filetype TEXT, -- One of 'LNK', 'DIR', 'REG', etc. 
See store.FSEntry.from_path for details - 'drw-r--r--', #unfrozen_perms TEXT, -- stat.filemode(os.stat(path).st_mode): '-rw-rw-r--' - None, #symlink_target TEXT, -- if this is a symlink, this is the (read but not fully resolved) target - '9add10cc3a6f0e4618dfed005ddfbeafdf268c58b773ba0021963c856d00235b', #sha256 TEXT NOT NULL, - None, #source_program INTEGER, -- Note that this is redundant since datum points to a program... - None, #source_datum INTEGER, + False, #frozen BOOL NOT NULL, + False, #deleted BOOL NOT NULL, )] ) return cur @@ -259,21 +235,26 @@ def test_nondir_parent_directory_insert(insert_files): # declaring parent as 5, but 5 is a file (plots.png) cur.execute( 'INSERT INTO filedir VALUES ' - '(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)', + '(?, ?, ?, ?, ?, ?, ?)', ( None, #id INTEGER PRIMARY KEY NOT NULL, 2, #store INTEGER NOT NULL, 'some_filedir.txt', #filename TEXT, -- only a filename, not a path 5, #parent INTEGER REFERENCES filedir ON UPDATE CASCADE, - False, #frozen BOOL NOT NULL, 'DIR', #filetype TEXT, -- One of 'LNK', 'DIR', 'REG', etc. See store.FSEntry.from_path for details - 'dr-xr-xr--', #unfrozen_perms TEXT, -- stat.filemode(os.stat(path).st_mode): '-rw-rw-r--' - None, #symlink_target TEXT, -- if this is a symlink, this is the (read but not fully resolved) target - '15c5e8d80a48803c18e72cd274532d608b8026dcbc192afc490fe1c289ec6ff1', #sha256 TEXT NOT NULL, - None, #source_program INTEGER, -- Note that this is redundant since datum points to a program... 
- None, #source_datum INTEGER, + False, #frozen BOOL NOT NULL, + False, #deleted BOOL NOT NULL, )) for row in cur.connection.iterdump(): print(row) cur.execute('SELECT * FROM filedir') print(cur.fetchall()) + + +def test_update_deleted_file(insert_files): + cur = insert_files + # first we set a file to deleted + cur.execute('UPDATE filedir SET deleted=True WHERE id=5') + with pytest.raises(sqlite3.IntegrityError): + # Now we try and update it, which should fail due to trigger + cur.execute('UPDATE filedir SET frozen=True WHERE id=5')