Clean up schema and add filedir_versions table

This commit is contained in:
Jacob Hinkle 2022-09-22 15:47:49 -04:00
parent 9d48d4d227
commit fb6a54c6ef
2 changed files with 143 additions and 109 deletions

View File

@ -8,6 +8,21 @@ PRAGMA foreign_keys=ON;
-- facilitate easy importing of one database into another
-- This lets us enable or disable triggers. For example, when importing we need
-- to update some tables to account for changing primary keys using ON UPDATE
-- CASCADE. However, some dependent tables like filedir may have a trigger that
-- prevents all updates: e.g. for deleted files. In this case, we need to be
-- able to disable that one specific trigger during imports, then re-enable it.
-- We would do so by updating this table and setting enabled=False/True.
--
-- Note that entries in this table only need to be inserted when we know we have
-- a need to disable the trigger. Otherwise, this table can be ignored.
-- Registry of triggers that can be switched on and off at runtime. A trigger
-- that wants to be toggleable looks up its own row here and tests `enabled`.
CREATE TABLE triggers (
    name TEXT,     -- name of the trigger this row controls
    enabled BOOL,  -- set to False to disable the trigger, True to re-enable it
    PRIMARY KEY (name)
);
-- We track host machines, so that we can give better information about file
-- locations. This also allows us to see whether platform-dependent behavior may
-- come into play. Note that these are not reliable identifiers for purposes of
@ -79,42 +94,90 @@ CREATE TABLE filedir (id INTEGER PRIMARY KEY NOT NULL,
filename TEXT, -- only a filename, not a path
parent INTEGER REFERENCES filedir ON UPDATE CASCADE,
frozen BOOL NOT NULL,
filetype TEXT, -- One of 'LNK', 'DIR', 'REG', etc. See store.FSEntry.from_path for details
unfrozen_perms TEXT, -- stat.filemode(os.stat(path).st_mode): '-rw-rw-r--'
symlink_target TEXT, -- if this is a symlink, this is the (read but not fully resolved) target
-- Note that changes in filetype are seen as a delete/replace instead of an
-- update. If such a change is noticed, the original filedir entry must be
-- marked deleted and a new one created with the new filetype.
sha256 TEXT NOT NULL,
source_program INTEGER, -- Note that this is redundant since datum points to a program...
source_datum INTEGER,
frozen BOOL NOT NULL,
deleted BOOL NOT NULL, -- set True when recording a deleted file
-- NOTE: a deleted file should not be updated, other than through a cascade
UNIQUE(store, filename, parent),
FOREIGN KEY (store) REFERENCES store (id) ON UPDATE CASCADE,
FOREIGN KEY (source_datum) REFERENCES datum (id) ON UPDATE CASCADE,
FOREIGN KEY (source_program) REFERENCES program (id) ON UPDATE CASCADE
FOREIGN KEY (store) REFERENCES store (id) ON UPDATE CASCADE
);
CREATE TRIGGER insert_filedir BEFORE INSERT ON filedir
BEGIN
SELECT
CASE
WHEN NEW.parent IS NOT NULL AND (SELECT filetype FROM filedir WHERE id = NEW.parent) != 'DIR'
BEGIN SELECT CASE
WHEN NEW.parent IS NOT NULL AND (SELECT filetype FROM filedir WHERE id = NEW.parent) != 'DIR'
THEN RAISE (ABORT, 'Parent is not listed as a directory')
WHEN NEW.parent IS NOT NULL AND NEW.store != (SELECT store FROM filedir WHERE id = NEW.parent)
WHEN NEW.parent IS NOT NULL AND NEW.store != (SELECT store FROM filedir WHERE id = NEW.parent)
THEN RAISE (ABORT, 'Parent directory resides in different store')
END;
END;
END; END;
CREATE TRIGGER update_filedir BEFORE UPDATE ON filedir
BEGIN
SELECT
CASE
WHEN NEW.parent IS NOT NULL AND (SELECT filetype FROM filedir WHERE id = NEW.parent) != 'DIR'
BEGIN SELECT CASE
WHEN NEW.parent IS NOT NULL AND (SELECT filetype FROM filedir WHERE id = NEW.parent) != 'DIR'
THEN RAISE (ABORT, 'Parent is not listed as a directory')
WHEN NEW.parent IS NOT NULL AND NEW.store != (SELECT store FROM filedir WHERE id = NEW.parent)
WHEN NEW.parent IS NOT NULL AND NEW.store != (SELECT store FROM filedir WHERE id = NEW.parent)
THEN RAISE (ABORT, 'Parent directory resides in different store')
END;
END;
END; END;
-- Files that are recorded to be deleted have the deleted column set to True.
-- These files should no longer be used. In particular, we should never rename
-- or update them since their lifetime is over.
-- Register the trigger as toggleable (enabled by default).
INSERT INTO triggers (name, enabled)
VALUES ('update_deleted_filedir', TRUE);
-- Abort any UPDATE of a filedir row that was already marked deleted, unless
-- this trigger has been disabled through its row in the triggers table.
CREATE TRIGGER update_deleted_filedir
BEFORE UPDATE ON filedir
BEGIN
    SELECT RAISE (ABORT, 'Cannot modify filedir entry for deleted file')
    WHERE OLD.deleted
        AND (SELECT enabled FROM triggers WHERE name = 'update_deleted_filedir');
END;
-- This table holds _versions_ of files. This table holds information that is
-- independent of the location within the filesystem and merely indicates a
-- version of the content in a format that is nearly independent of filetype
-- (though the computation of content hashes is of course dependent on
-- filetype). Each version has a number, and was provided by some program (and
-- potentially a datum).
CREATE TABLE filedir_version (
    id INTEGER PRIMARY KEY NOT NULL,
    -- the filedir entry that this row is a version of
    filedir INTEGER,
    -- incremented whenever a new version of the file is recorded
    version INTEGER NOT NULL,
    -- Permissions are stored with each version so that they can be restored
    -- properly when thawing after a freeze operation.
    -- Format: stat.filemode(os.stat(path).st_mode), e.g. '-rw-rw-r--'
    unfrozen_perms TEXT,
    -- For a symlink, the target as read (not fully resolved); in other words
    -- the "content" of the symlink.
    symlink_target TEXT,
    -- The hash may be NULL when the file was deleted, or when hashing has been
    -- deferred. Deferring is preferable when a program uses a file or
    -- directory for several tasks that may each mutate it: every task adds a
    -- version, but hashing each of those intermediate versions would slow
    -- execution for little benefit, since they may not be final. In that case
    -- files and directories are hashed only once, at the end of the program.
    sha256 TEXT,
    source_task INTEGER,
    UNIQUE (filedir, version),
    FOREIGN KEY (filedir) REFERENCES filedir ON UPDATE CASCADE,
    FOREIGN KEY (source_task) REFERENCES task (id) ON UPDATE CASCADE
);
-- Disallow inserting or updating versions for deleted files. (The version
-- deleting the file should be added first, then the deleted field set on the
-- filedir row.) The update trigger is toggleable to enable importing.
-- Reject a new version row when the filedir entry it belongs to is already
-- marked deleted. filedir_version points at its owning filedir row through the
-- `filedir` column (the table has no `parent` column), so that is the column
-- used for the lookup.
CREATE TRIGGER insert_filedir_version_parent_deleted
BEFORE INSERT ON filedir_version
BEGIN
    SELECT CASE
        WHEN (SELECT deleted FROM filedir WHERE id = NEW.filedir)
        THEN RAISE (ABORT, 'Attempted to insert filedir_version whose filedir was deleted')
    END;
END;
-- Register the trigger as toggleable (enabled by default).
INSERT INTO triggers (name, enabled)
VALUES ('update_filedir_version_parent_deleted', TRUE);
-- Reject updates to a version row when the filedir entry it belongs to is
-- marked deleted, unless this trigger has been disabled through its row in the
-- triggers table. The owning filedir row is referenced by the `filedir` column
-- (filedir_version has no `parent` column).
CREATE TRIGGER update_filedir_version_parent_deleted
BEFORE UPDATE ON filedir_version
BEGIN
    SELECT CASE
        WHEN (SELECT deleted FROM filedir WHERE id = NEW.filedir)
            AND (SELECT enabled FROM triggers WHERE name = 'update_filedir_version_parent_deleted')
        THEN RAISE (ABORT, 'Attempted to update filedir_version whose parent was deleted')
    END;
END;
-- A computational environment which can execute "programs". Note that the
-- python executable being used, and environment variables are recorded here.
@ -160,34 +223,35 @@ CREATE TABLE program (id INTEGER PRIMARY KEY NOT NULL,
-- case some calling code winds up calling a previously-unloaded module.
CREATE TABLE py_package (id INTEGER PRIMARY KEY NOT NULL,
name TEXT NOT NULL,
version TEXT,
program INTEGER NOT NULL,
FOREIGN KEY (program) REFERENCES program (id) ON UPDATE CASCADE
version TEXT
);
-- A py_module describes any python module file containing decorated Functions.
-- Modules are tracked since they impact the global scope of function calls.
CREATE TABLE py_module(id INTEGER PRIMARY KEY NOT NULL,
name TEXT NOT NULL,
code TEXT,
package INTEGER,
FOREIGN KEY (package) REFERENCES py_package (id) ON UPDATE CASCADE
py_package INTEGER,
FOREIGN KEY (py_package) REFERENCES py_package (id) ON UPDATE CASCADE
);
-- A "func" just describes a function, without reference to its arguments. It
-- can have inputs and outputs, which are described in the func_inputs and
-- A py_function just describes a function, without reference to its arguments.
-- It can have inputs and outputs, which are described in the py_function_input
-- and py_function_output child tables.
CREATE TABLE func(id INTEGER PRIMARY KEY NOT NULL,
CREATE TABLE py_function(id INTEGER PRIMARY KEY NOT NULL,
name TEXT NOT NULL,
module INTEGER NOT NULL,
py_module INTEGER NOT NULL,
-- we use POSIX timestamps for time recording.
-- e.g. datetime.datetime.now().timestamp()
start_time REAL,
end_time REAL,
FOREIGN KEY (module) REFERENCES py_module (id) ON UPDATE CASCADE
FOREIGN KEY (py_module) REFERENCES py_module (id) ON UPDATE CASCADE
);
CREATE TABLE func_input(id INTEGER PRIMARY KEY NOT NULL,
CREATE TABLE py_function_input(id INTEGER PRIMARY KEY NOT NULL,
name TEXT NOT NULL,
typename TEXT NOT NULL,
func INTEGER NOT NULL,
py_function INTEGER NOT NULL,
position INTEGER,
posonly BOOL,
@ -195,28 +259,30 @@ CREATE TABLE func_input(id INTEGER PRIMARY KEY NOT NULL,
description TEXT,
FOREIGN KEY (func) REFERENCES func (id) ON UPDATE CASCADE
FOREIGN KEY (py_function) REFERENCES py_function (id) ON UPDATE CASCADE
);
CREATE TABLE func_output(id INTEGER PRIMARY KEY NOT NULL,
CREATE TABLE py_function_output(id INTEGER PRIMARY KEY NOT NULL,
name TEXT,
typename TEXT NOT NULL,
func INTEGER NOT NULL,
py_function INTEGER NOT NULL,
position INTEGER NOT NULL,
description TEXT,
FOREIGN KEY (func) REFERENCES func (id) ON UPDATE CASCADE
FOREIGN KEY (py_function) REFERENCES py_function (id) ON UPDATE CASCADE
);
-- Tasks are executed calls to Functions: they correspond to a particular set of
-- inputs which are themselves data (see datum table). A task is executed in
-- the context of a "program". Within a program, tasks are typically evaluated
-- in a serial manner.
-- Tasks are usually executed calls to Functions: they correspond to a
-- particular set of inputs which are themselves data (see datum table). A task
-- is executed in the context of a "program". Within a program, tasks are
-- typically evaluated in a serial manner.
CREATE TABLE task(id INTEGER PRIMARY KEY NOT NULL,
func INTEGER NOT NULL,
program INTEGER NOT NULL,
FOREIGN KEY (func) REFERENCES func (id) ON UPDATE CASCADE,
py_function INTEGER, -- py_function is NULL for some built-in functionality like "record" programs
FOREIGN KEY (py_function) REFERENCES py_function (id) ON UPDATE CASCADE,
FOREIGN KEY (program) REFERENCES program (id) ON UPDATE CASCADE
);
-- A datum is an object that is computed as the output of a task, given as a
@ -237,26 +303,13 @@ CREATE TABLE datum(id INTEGER PRIMARY KEY NOT NULL,
typename TEXT NOT NULL, -- string representation of the data type
saved_path TEXT, -- if residing on disk, the path relative to this db file
saved_version TEXT, -- if residing on disk, the path relative to this db file
-- hash of provenance summarizes all dependencies, independent of environment
-- computed _before_ computation
hash_provenance TEXT,
hash_content TEXT, -- hash of content is only computed when this datum is saved to disk
-- Version starts at 0 when provided/first computed. When passed as
-- a non-const input to a non-const function, this version recorded and
-- incremented
current_version INTEGER NOT NULL,
FOREIGN KEY (task) REFERENCES task (id) ON UPDATE CASCADE,
FOREIGN KEY (task_output) REFERENCES func_output (id) ON UPDATE CASCADE
);
-- A task input records the version of a Datum that is passed to the function
-- A task_input records the version of a Datum that is passed to a function
CREATE TABLE task_input(id INTEGER PRIMARY KEY NOT NULL,
task INTEGER NOT NULL,
func_input INTEGER NOT NULL,
py_function_input INTEGER, -- if this was a python function, reference which input
datum INTEGER NOT NULL,
-- Data have versions to facilitate tracking non-const operations. If a datum
@ -265,6 +318,6 @@ CREATE TABLE task_input(id INTEGER PRIMARY KEY NOT NULL,
datum_version INTEGER NOT NULL,
FOREIGN KEY (task) REFERENCES task (id) ON UPDATE CASCADE,
FOREIGN KEY (func_input) REFERENCES func_input (id) ON UPDATE CASCADE,
FOREIGN KEY (py_function_input) REFERENCES py_function_input (id) ON UPDATE CASCADE,
FOREIGN KEY (datum) REFERENCES datum (id) ON UPDATE CASCADE
);

View File

@ -152,43 +152,31 @@ def insert_directories(insert_store):
cur = insert_store
cur.executemany(
'INSERT INTO filedir VALUES '
'(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)',
'(?, ?, ?, ?, ?, ?, ?)',
[(
None, #id INTEGER PRIMARY KEY NOT NULL,
1, #store INTEGER NOT NULL,
'.', #filename TEXT, -- only a filename, not a path
None, #parent INTEGER REFERENCES filedir ON UPDATE CASCADE,
False, #frozen BOOL NOT NULL,
'DIR', #filetype TEXT, -- One of 'LNK', 'DIR', 'REG', etc. See store.FSEntry.from_path for details
'dr-xr-xr--', #unfrozen_perms TEXT, -- stat.filemode(os.stat(path).st_mode): '-rw-rw-r--'
None, #symlink_target TEXT, -- if this is a symlink, this is the (read but not fully resolved) target
'e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855', #sha256 TEXT NOT NULL,
None, #source_program INTEGER, -- Note that this is redundant since datum points to a program...
None, #source_datum INTEGER,
False, #frozen BOOL NOT NULL,
False, #deleted BOOL NOT NULL,
), (
None, #id INTEGER PRIMARY KEY NOT NULL,
1, #store INTEGER NOT NULL,
'foo', #filename TEXT, -- only a filename, not a path
1, #parent INTEGER REFERENCES filedir ON UPDATE CASCADE,
False, #frozen BOOL NOT NULL,
'DIR', #filetype TEXT, -- One of 'LNK', 'DIR', 'REG', etc. See store.FSEntry.from_path for details
'dr-xr-xr--', #unfrozen_perms TEXT, -- stat.filemode(os.stat(path).st_mode): '-rw-rw-r--'
None, #symlink_target TEXT, -- if this is a symlink, this is the (read but not fully resolved) target
'5ad4e9e3090de8de781e4a35ce6ba16ad8eaba7a5456cd230a36f77143885396', #sha256 TEXT NOT NULL,
None, #source_program INTEGER, -- Note that this is redundant since datum points to a program...
None, #source_datum INTEGER,
False, #frozen BOOL NOT NULL,
False, #deleted BOOL NOT NULL,
), (
None, #id INTEGER PRIMARY KEY NOT NULL,
2, #store INTEGER NOT NULL,
'.', #filename TEXT, -- only a filename, not a path
None, #parent INTEGER REFERENCES filedir ON UPDATE CASCADE,
False, #frozen BOOL NOT NULL,
'DIR', #filetype TEXT, -- One of 'LNK', 'DIR', 'REG', etc. See store.FSEntry.from_path for details
'dr-xr-xr--', #unfrozen_perms TEXT, -- stat.filemode(os.stat(path).st_mode): '-rw-rw-r--'
None, #symlink_target TEXT, -- if this is a symlink, this is the (read but not fully resolved) target
'15c5e8d80a48803c18e72cd274532d608b8026dcbc192afc490fe1c289ec6ff1', #sha256 TEXT NOT NULL,
None, #source_program INTEGER, -- Note that this is redundant since datum points to a program...
None, #source_datum INTEGER,
False, #frozen BOOL NOT NULL,
False, #deleted BOOL NOT NULL,
)],
)
return cur
@ -199,19 +187,15 @@ def test_crossstore_directory_insert(insert_directories):
# declaring directory as belonging to store 2, but parent's store is 1
cur.execute(
'INSERT INTO filedir VALUES '
'(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)',
'(?, ?, ?, ?, ?, ?, ?)',
(
None, #id INTEGER PRIMARY KEY NOT NULL,
2, #store INTEGER NOT NULL,
'some_dir', #filename TEXT, -- only a filename, not a path
1, #parent INTEGER REFERENCES filedir ON UPDATE CASCADE,
False, #frozen BOOL NOT NULL,
'DIR', #filetype TEXT, -- One of 'LNK', 'DIR', 'REG', etc. See store.FSEntry.from_path for details
'dr-xr-xr--', #unfrozen_perms TEXT, -- stat.filemode(os.stat(path).st_mode): '-rw-rw-r--'
None, #symlink_target TEXT, -- if this is a symlink, this is the (read but not fully resolved) target
'15c5e8d80a48803c18e72cd274532d608b8026dcbc192afc490fe1c289ec6ff1', #sha256 TEXT NOT NULL,
None, #source_program INTEGER, -- Note that this is redundant since datum points to a program...
None, #source_datum INTEGER,
False, #frozen BOOL NOT NULL,
False, #deleted BOOL NOT NULL,
))
for row in cur.connection.iterdump():
print(row)
@ -224,31 +208,23 @@ def insert_files(insert_directories):
cur = insert_directories
cur.executemany(
'INSERT INTO filedir VALUES '
'(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)',
'(?, ?, ?, ?, ?, ?, ?)',
[(
None, #id INTEGER PRIMARY KEY NOT NULL,
1, #store INTEGER NOT NULL,
'example.csv', #filename TEXT, -- only a filename, not a path
1, #parent INTEGER REFERENCES filedir ON UPDATE CASCADE,
False, #frozen BOOL NOT NULL,
'REG', #filetype TEXT, -- One of 'LNK', 'DIR', 'REG', etc. See store.FSEntry.from_path for details
'drw-rw-r--', #unfrozen_perms TEXT, -- stat.filemode(os.stat(path).st_mode): '-rw-rw-r--'
None, #symlink_target TEXT, -- if this is a symlink, this is the (read but not fully resolved) target
'9aafde8f9dbec34c694b86333f746f58958c44247c474904e06d1f07f94292b4', #sha256 TEXT NOT NULL,
None, #source_program INTEGER, -- Note that this is redundant since datum points to a program...
None, #source_datum INTEGER,
False, #frozen BOOL NOT NULL,
False, #deleted BOOL NOT NULL,
), (
None, #id INTEGER PRIMARY KEY NOT NULL,
1, #store INTEGER NOT NULL,
'plots.png', #filename TEXT, -- only a filename, not a path
2, #parent INTEGER REFERENCES filedir ON UPDATE CASCADE,
False, #frozen BOOL NOT NULL,
'REG', #filetype TEXT, -- One of 'LNK', 'DIR', 'REG', etc. See store.FSEntry.from_path for details
'drw-r--r--', #unfrozen_perms TEXT, -- stat.filemode(os.stat(path).st_mode): '-rw-rw-r--'
None, #symlink_target TEXT, -- if this is a symlink, this is the (read but not fully resolved) target
'9add10cc3a6f0e4618dfed005ddfbeafdf268c58b773ba0021963c856d00235b', #sha256 TEXT NOT NULL,
None, #source_program INTEGER, -- Note that this is redundant since datum points to a program...
None, #source_datum INTEGER,
False, #frozen BOOL NOT NULL,
False, #deleted BOOL NOT NULL,
)]
)
return cur
@ -259,21 +235,26 @@ def test_nondir_parent_directory_insert(insert_files):
# declaring parent as 5, but 5 is a file (plots.png)
cur.execute(
'INSERT INTO filedir VALUES '
'(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)',
'(?, ?, ?, ?, ?, ?, ?)',
(
None, #id INTEGER PRIMARY KEY NOT NULL,
2, #store INTEGER NOT NULL,
'some_filedir.txt', #filename TEXT, -- only a filename, not a path
5, #parent INTEGER REFERENCES filedir ON UPDATE CASCADE,
False, #frozen BOOL NOT NULL,
'DIR', #filetype TEXT, -- One of 'LNK', 'DIR', 'REG', etc. See store.FSEntry.from_path for details
'dr-xr-xr--', #unfrozen_perms TEXT, -- stat.filemode(os.stat(path).st_mode): '-rw-rw-r--'
None, #symlink_target TEXT, -- if this is a symlink, this is the (read but not fully resolved) target
'15c5e8d80a48803c18e72cd274532d608b8026dcbc192afc490fe1c289ec6ff1', #sha256 TEXT NOT NULL,
None, #source_program INTEGER, -- Note that this is redundant since datum points to a program...
None, #source_datum INTEGER,
False, #frozen BOOL NOT NULL,
False, #deleted BOOL NOT NULL,
))
for row in cur.connection.iterdump():
print(row)
cur.execute('SELECT * FROM filedir')
print(cur.fetchall())
def test_update_deleted_file(insert_files):
    """A filedir row that has been marked deleted must reject further updates.

    Uses the ``insert_files`` fixture, which yields a cursor on a database
    that already contains the directory and file rows.
    """
    cur = insert_files
    # Mark row 5 as deleted. This update is allowed because the row was not
    # yet deleted at the time the statement ran.
    cur.execute('UPDATE filedir SET deleted=True WHERE id=5')
    # Any later update must be aborted by the update_deleted_filedir trigger.
    # Matching on the trigger's message ensures that an unrelated
    # IntegrityError cannot make this test pass.
    with pytest.raises(sqlite3.IntegrityError,
                       match='Cannot modify filedir entry for deleted file'):
        cur.execute('UPDATE filedir SET frozen=True WHERE id=5')