diff --git a/pyproject.toml b/pyproject.toml index 7042148..0c81748 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ description = "Composable tracking of scientific data provenance" authors = ["Jacob Hinkle "] [tool.poetry.dependencies] -python = "^3.8" +python = "^3.10" click = "^8.1.3" colorama = "^0.4.5" loguru = "^0.6.0" @@ -28,7 +28,7 @@ build-backend = "poetry.core.masonry.api" [tool.tox] legacy_tox_ini = """ [tox] -envlist = py38,py39,py310,mypy +envlist = mypy,py310 isolated_build = true [testenv] @@ -40,7 +40,10 @@ commands = pytest --cov src/nancy [testenv:mypy] -deps = mypy +deps = + mypy + pytest commands = - mypy --strict -p nancy + mypy --strict --ignore-missing-imports --no-incremental -p nancy + mypy --strict --no-incremental tests/ """ diff --git a/src/nancy/cli/diff.py b/src/nancy/cli/diff.py index 4afcfdf..6143846 100644 --- a/src/nancy/cli/diff.py +++ b/src/nancy/cli/diff.py @@ -48,13 +48,16 @@ def print_diff( dirstr = ( (filetypecolors["DIR"] + dname + "/" + reset) if dname != "" else "" ) - fname = filetypecolors.get(entry.filetype, "") + fname + reset + assert entry.filetype is not None + fname = filetypecolors.get(str(entry.filetype), "") + fname + reset - if entry.filetype == "LNK": # append symlink target + if entry.filetype == fs.FileType.LNK: # append symlink target + assert entry.symlink_target is not None fname += " -> " + entry.symlink_target relpath = dirstr + fname + assert entry.sha256 is not None hashchange = ( (hashcolor + entry.sha256.hex() + reset + " " + changetags[tag]) if show_hashes diff --git a/src/nancy/cli/freeze.py b/src/nancy/cli/freeze.py index 18e2867..35a6949 100644 --- a/src/nancy/cli/freeze.py +++ b/src/nancy/cli/freeze.py @@ -3,11 +3,12 @@ import click from .. import store import os +from typing import List @click.command() @click.argument("directory") -def freeze(directory): +def freeze(directory: str) -> None: """ Initialize tracking in a directory or freeze a tracked directory. 
@@ -25,12 +26,12 @@ def freeze(directory): else: # this is an existing store s = store.Store(directory) - s.freeze() + # s.freeze() @click.command() @click.argument("files", nargs=-1) # , help="Files or directories to thaw.") -def thaw(files): +def thaw(files: List[str]) -> None: """ Enable manual alteration of files within a tracked directory. diff --git a/src/nancy/cli/record.py b/src/nancy/cli/record.py index b333e05..fb27928 100644 --- a/src/nancy/cli/record.py +++ b/src/nancy/cli/record.py @@ -32,7 +32,7 @@ def record( if not os.path.exists(os.path.join(store_path, "nancy.db")): # this is a new store logger.info(f"Initializing new store in {store_path}...") - s = store.Store.init(store_path) + s = store.Store.init(message=message, directory=store_path) else: # this is an existing store s = store.Store(store_path) @@ -78,7 +78,7 @@ def record( "of given paths. If given the path to a non-store directory, a new " "store is initialized there.", ) -def record_cli(show_hashes, no_color, message, store): +def record_cli(show_hashes: bool, no_color: bool, message: str, store: str) -> None: """ Initialize tracking or record changes to a tracked directory. """ diff --git a/src/nancy/environment.py b/src/nancy/environment.py index 5035299..bdfd7f1 100644 --- a/src/nancy/environment.py +++ b/src/nancy/environment.py @@ -1,83 +1,79 @@ from . 
import user +from dataclasses import dataclass +import hashlib import json import os import platform import sqlite3 import sys -from typing import NamedTuple, Optional, TypeVar, Type +import time +from typing import Callable, Optional, TypeVar, Type # see https://stackoverflow.com/questions/44640479/type-annotation-for-classmethod-returning-instance _EnvironmentT = TypeVar("_EnvironmentT", bound="Environment") -class Environment(NamedTuple): - id: Optional[int] +@dataclass(frozen=True, slots=True) +class Environment: envvars_json: str python_implementation: str python_strversion: str python_hexversion: int user: user.User + timezone: str + release: str + freedesktop_os_release: str + win32_ver: str + mac_ver: str + sha256: bytes = b"" - @classmethod - def find_or_insert( - cls: Type[_EnvironmentT], - cur: sqlite3.Cursor, - env: Optional[_EnvironmentT] = None, - ) -> _EnvironmentT: - """Given a DB cursor, find or create row in environment table and fill""" - if env is None: - env = cls.detect() + def __post_init__(self) -> None: + m = hashlib.sha256() + upstr: Callable[[str], None] = lambda s: m.update(bytes(s, "utf-8")) + upint: Callable[[int], None] = lambda n: m.update( + n.to_bytes((n.bit_length() + 7) // 8, byteorder="big"), + ) + upstr(self.envvars_json) + upstr(self.python_implementation) + upstr(self.python_strversion) + upint(self.python_hexversion) + m.update(self.user.sha256) + upstr(self.timezone) + upstr(self.release) + upstr(self.freedesktop_os_release) + upstr(self.win32_ver) + upstr(self.mac_ver) + # circumvent freezing to set sha256 + object.__setattr__(self, "sha256", m.digest()) - u = user.User.find_or_insert(cur) + def maybe_insert(self, cur: sqlite3.Cursor) -> str: + """Insert if not exists, without committing.""" + self.user.maybe_insert(cur) - env = env._replace(user=u) + key = self.sha256.hex() - # insert or ignore, handle each case to set id cur.execute( """ - SELECT - id - FROM - environment - WHERE - envvars_json = ? 
AND - python_implementation = ? AND - python_strversion = ? AND - python_hexversion = ? AND - user = ? - LIMIT 1 + INSERT OR IGNORE INTO environment VALUES (?,?,?,?,?,?,?,?,?,?,?); """, ( - env.envvars_json, - env.python_implementation, - env.python_strversion, - env.python_hexversion, - env.user.id, + key, + self.envvars_json, + self.python_implementation, + self.python_strversion, + self.python_hexversion, + self.user.sha256.hex(), + self.timezone, + self.release, + self.freedesktop_os_release, + self.win32_ver, + self.mac_ver, ), ) - res = cur.fetchone() - if res is None: - cur.execute( - """ - INSERT INTO environment VALUES (?,?,?,?,?,?); - """, - ( - env.id, - env.envvars_json, - env.python_implementation, - env.python_strversion, - env.python_hexversion, - env.user.id, - ), - ) - id = cur.lastrowid - cur.connection.commit() - else: - id = res[0] - return env._replace(id=id) + return key @classmethod def detect(cls: Type[_EnvironmentT]) -> _EnvironmentT: @@ -87,11 +83,22 @@ class Environment(NamedTuple): """ u = user.User.detect() + fdor = "" + try: + fdor = json.dumps(platform.freedesktop_os_release()) + except AttributeError: + # freedesktop_os_release only available for python >= 3.10 + fdor = "" + return cls( - None, json.dumps(dict(os.environ)), platform.python_implementation(), sys.version, sys.hexversion, u, + timezone=time.tzname[time.daylight], + release=platform.release(), + freedesktop_os_release=fdor, + win32_ver=json.dumps(platform.win32_ver()), + mac_ver=json.dumps(platform.mac_ver()), ) diff --git a/src/nancy/fs.py b/src/nancy/fs.py index e55771f..c52993d 100644 --- a/src/nancy/fs.py +++ b/src/nancy/fs.py @@ -203,7 +203,7 @@ class FSEntry: ) s = filestat.st_mode - children = [] + children: List[FSEntry] = [] symlink_target: Optional[Union[str, bytes]] = None if os.path.islink(path): # Check links first, since it is not exclusive with dir or file checks @@ -518,7 +518,7 @@ class FSDiff: allnames = set(list(Alist.keys()) + list(Blist.keys())) - 
modified_children = [ + modified_children: List[FSDiff] = [ cls.compute( Alist.get(n, None), Blist.get(n, None), diff --git a/src/nancy/machine.py b/src/nancy/machine.py index bef26d5..9fcc94e 100644 --- a/src/nancy/machine.py +++ b/src/nancy/machine.py @@ -1,104 +1,83 @@ -from typing import NamedTuple, Optional, Type, TypeVar +from dataclasses import dataclass +import hashlib import json import platform import sqlite3 import time +from typing import Callable, NamedTuple, Optional, Type, TypeVar # see https://stackoverflow.com/questions/44640479/type-annotation-for-classmethod-returning-instance _MachineT = TypeVar("_MachineT", bound="Machine") -class Machine(NamedTuple): - id: Optional[int] +@dataclass(frozen=True, slots=True) +class Machine: machine_id: Optional[str] hostname: str - processor: str system: str - release: str cpu_type: str - timezone: str - freedesktop_os_release: str - win32_ver: str - mac_ver: str + processor: str + sha256: bytes = b"" - @classmethod - def find_or_insert( - cls: Type[_MachineT], cur: sqlite3.Cursor, machine: Optional[_MachineT] = None - ) -> _MachineT: - """Given a DB cursor, find or create row in machine table and fill""" - if machine is None: - machine = cls.detect() + def __post_init__(self) -> None: + """Derive sha256 from other attributes.""" + m = hashlib.sha256() + upstr: Callable[[str], None] = lambda s: m.update(bytes(s, "utf-8")) + if self.machine_id is not None: + upstr(self.machine_id) + upstr(self.hostname) + upstr(self.processor) + upstr(self.system) + upstr(self.cpu_type) + # circumvent freezing to set sha256 + object.__setattr__(self, "sha256", m.digest()) + + def maybe_insert(self, cur: sqlite3.Cursor) -> str: + """Insert if not exists, without committing.""" + key = self.sha256.hex() - # insert or ignore, handle each case to set id cur.execute( """ - SELECT - id - FROM - machine - WHERE - machine_id = ? AND - hostname = ? AND - processor = ? AND - system = ? AND - release = ? AND - cpu_type = ? 
AND - timezone = ? AND - freedesktop_os_release = ? AND - win32_ver = ? AND - mac_ver = ? - LIMIT 1 + INSERT OR IGNORE INTO machine VALUES (?,?,?,?,?,?); """, - machine[1:], + ( + key, + self.machine_id, + self.hostname, + self.system, + self.cpu_type, + self.processor, + ), ) - res = cur.fetchone() - if res is None: - cur.execute( - """ - INSERT INTO machine VALUES (?,?,?,?,?,?,?,?,?,?,?); - """, - machine, - ) - id = cur.lastrowid - cur.connection.commit() - else: - id = res[0] - return machine._replace(id=id) + return key @classmethod def detect(cls: Type[_MachineT]) -> _MachineT: - """Formats machine-specific information into a MachineInfo object. + """Format machine-specific information into a MachineInfo object. Note that 'MachineInfo' objects are properly formatted to be inserted into the `machine` table. + + Raises: + NotImplementedError: if system != "Linux" """ - fdor = "" - try: - fdor = json.dumps(platform.freedesktop_os_release()) - except AttributeError: - # freedesktop_os_release only available for python >= 3.10 - fdor = "" system = platform.system() mid = None - if system == "Linux": + if system.lower() == "linux": try: mid = open("/etc/machine-id", "r").read() except FileNotFoundError: pass + else: + raise NotImplementedError() return cls( - None, machine_id=mid, hostname=platform.node(), - processor=platform.processor(), system=system, - release=platform.release(), cpu_type=platform.machine(), - timezone=time.tzname[time.daylight], - freedesktop_os_release=fdor, - win32_ver=json.dumps(platform.win32_ver()), - mac_ver=json.dumps(platform.mac_ver()), + processor=platform.processor(), ) diff --git a/src/nancy/schema/version0.sql b/src/nancy/schema/version0.sql index 0dcc9d8..beb0104 100644 --- a/src/nancy/schema/version0.sql +++ b/src/nancy/schema/version0.sql @@ -8,6 +8,15 @@ PRAGMA foreign_keys=ON; -- facilitate easy importing of one database into another +-- Simple key/value table describing _this_ store (not imported ones). 
+-- In particular, the key "local_store" should be the UUID of the store in this +-- directory. +CREATE TABLE local_metadata( + key TEXT PRIMARY KEY NOT NULL, + value TEXT +); + + -- This lets us enable or disable triggers. For example, when importing we need -- to update some tables to account for changing primary keys using ON UPDATE -- CASCADE. However, some dependent tables like filedir may have a trigger that @@ -18,58 +27,38 @@ PRAGMA foreign_keys=ON; -- Note that entries in this table only need to be inserted when we know we have -- a need to disable the trigger. Otherwise, this table can be ignored. CREATE TABLE triggers( - name TEXT PRIMARY KEY, + name TEXT PRIMARY KEY NOT NULL, enabled BOOL ); -- We track host machines, so that we can give better information about file -- locations. This also allows us to see whether platform-dependent behavior may --- come into play. Note that these are not reliable identifiers for purposes of --- tracking down data, but may be helpful context. -CREATE TABLE machine(id INTEGER PRIMARY KEY NOT NULL, +-- come into play. +-- A row of this table should describe the aspects of a node that are unlikely +-- to change during regular software updates. For example, the hostname and CPU +-- type are included but not the kernel version. Software that changes due to +-- updates should be included in the "environment" table instead. 
+CREATE TABLE machine( + sha256 TEXT PRIMARY KEY NOT NULL, machine_id TEXT, -- platform-dependent unique hardware id -- Linux: open('/etc/machine-id', 'r').read() (assumes systemd) - -- OSX: `ioreg -rd1 -c IOPlatformExpertDevice | grep IOPlatformUUID` + -- OSX: `ioreg -rd1 -c IOPlatformExpertDevice | grep IOPlatformUUID | awk '{$print $3}' | tr -d \"` -- Windows: `reg query HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Cryptography /v MachineGuid` hostname TEXT, -- platform.node(): 'lucky' - processor TEXT, -- platform.processor(): system TEXT, -- platform.system(): 'Linux' - release TEXT, -- platform.release(): '5.15.64' cpu_type TEXT, -- platform.machine(): 'x86_64' - timezone TEXT, -- timezone, for interpreting event times - freedesktop_os_release TEXT, -- requires python 3.10 - -- platform.freedesktop_os_release() as JSON - -- "{'NAME': 'NixOS', 'ID': 'nixos', - -- 'PRETTY_NAME': 'NixOS 22.05 (Quokka)', 'BUG_REPORT_URL': - -- 'https://github.com/NixOS/nixpkgs/issues', 'BUILD_ID': - -- '22.05.20220902.67e4507', 'DOCUMENTATION_URL': - -- 'https://nixos.org/learn.html', 'HOME_URL': 'https://nixos.org/', 'LOGO': - -- 'nix-snowflake', 'SUPPORT_URL': 'https://nixos.org/community.html', - -- 'VERSION': '22.05 (Quokka)', 'VERSION_CODENAME': 'quokka', 'VERSION_ID': - -- '22.05'}" - win32_ver TEXT, -- platform.win32_ver() as JSON - mac_ver TEXT, -- platform.mac_ver() as JSON - - -- disallow duplicate rows - UNIQUE(hostname, processor, system, release, cpu_type, timezone, - freedesktop_os_release, win32_ver, mac_ver) + processor TEXT -- platform.processor(): ); -- Programs are run by users on machines -CREATE TABLE user(id INTEGER PRIMARY KEY NOT NULL, +CREATE TABLE user( + sha256 TEXT PRIMARY KEY NOT NULL, username TEXT NOT NULL, -- getpass.getuser() userid INTEGER, -- os.getuid() fullname TEXT, -- on Linux/OSX: pwd.getpwuid(os.getuid()).pw_gecos -- on Windows: see https://stackoverflow.com/questions/21766954/how-to-get-windows-users-full-name-in-python - machine INTEGER NOT 
NULL, - - -- usernames and fullnames change rarely, but it happens. A more restrictive - -- constraint would not enable tracking of this info. The following only - -- guarantees no duplicate full rows. - UNIQUE(username, userid, fullname, machine), - - FOREIGN KEY (machine) REFERENCES machine (id) ON UPDATE CASCADE + machine TEXT NOT NULL REFERENCES machine ON UPDATE CASCADE ); @@ -88,41 +77,32 @@ CREATE TABLE user(id INTEGER PRIMARY KEY NOT NULL, -- Client code should generate random UUIDs in the RFC 4122 variant layout. -- https://datatracker.ietf.org/doc/html/rfc4122.html -- This is possible in Python by simply calling uuid.uuid4() with no arguments -CREATE TABLE store (id INTEGER PRIMARY KEY NOT NULL, - machine INTEGER, - dbpath TEXT NOT NULL, - uuid TEXT NOT NULL, -- UUID generated by str(uuid.uuid4()) - imported BOOL, -- is this the store for the current directory? If not, then it was - -- imported in order to describe a dependency. - - FOREIGN KEY (machine) REFERENCES machine (id) ON UPDATE CASCADE +CREATE TABLE store ( + uuid TEXT PRIMARY KEY NOT NULL -- UUID generated by str(uuid.uuid4()) ); -- The filedir table holds all files and directories that are tracked by the -- store. This table also holds tracked files and directories that have been -- imported and live outside the current store. --- We do not support renaming files. Once an entry is created here, it should --- only be updated to reflect frozen/thawed status. -CREATE TABLE filedir (id INTEGER PRIMARY KEY NOT NULL, - store INTEGER NOT NULL, +-- We do not support renaming files. 
+CREATE TABLE filedir ( + uuid TEXT PRIMARY KEY NOT NULL, + store TEXT NOT NULL, name TEXT, -- only a filename, not a path - parent INTEGER REFERENCES filedir ON UPDATE CASCADE, - frozen BOOL NOT NULL, - + parent TEXT REFERENCES filedir ON UPDATE CASCADE, UNIQUE(store, name, parent) ); -- Detect cross-store references CREATE TRIGGER insert_filedir BEFORE INSERT ON filedir BEGIN SELECT CASE - WHEN NEW.parent IS NOT NULL AND NEW.store != (SELECT store FROM filedir WHERE id = NEW.parent) + WHEN NEW.parent IS NOT NULL AND NEW.store != (SELECT store FROM filedir WHERE uuid = NEW.parent) THEN RAISE (ABORT, 'Parent resides in different store') END; END; CREATE TRIGGER update_filedir BEFORE UPDATE ON filedir -BEGIN SELECT CASE - WHEN (NEW.id != OLD.id OR NEW.store != OLD.store OR NEW.parent != OLD.parent) - THEN RAISE (ABORT, 'The only updates to filedir allowed are to the frozen column') -END; END; +BEGIN + SELECT RAISE (ABORT, 'No updates to filedir allowed'); +END; -- This table holds _versions_ of files and directories. This table holds @@ -131,8 +111,9 @@ END; END; -- independent of filetype (though the computation of content hashes is of -- course dependent on filetype). Each version has a number, and was provided by -- some program (and potentially a datum). -CREATE TABLE filedir_version (id INTEGER PRIMARY KEY NOT NULL, - filedir INTEGER NOT NULL +CREATE TABLE filedir_version ( + uuid TEXT PRIMARY KEY NOT NULL, + filedir TEXT NOT NULL REFERENCES filedir ON UPDATE CASCADE, -- parent filedir entry recorded_time REAL, -- When was this version recorded? @@ -144,7 +125,7 @@ CREATE TABLE filedir_version (id INTEGER PRIMARY KEY NOT NULL, -- We record the permissions on each file, in a way that enables reloading -- permissions properly when thawing after a freeze operation. 
- unfrozen_perms TEXT, -- stat.filemode(os.stat(path).st_mode): '-rw-rw-r--' + perms TEXT, -- stat.filemode(os.stat(path).st_mode): '-rw-rw-r--' symlink_target TEXT, -- if this is a symlink, this is the (read but not fully resolved) target. i.e. this is the "content" of the symlink. @@ -158,7 +139,7 @@ CREATE TABLE filedir_version (id INTEGER PRIMARY KEY NOT NULL, -- can defer by only hashing files and directories at the end of a program. sha256 TEXT, - source_task INTEGER REFERENCES task (id) ON UPDATE CASCADE + source_task TEXT REFERENCES task ON UPDATE CASCADE ); -- Disallow UPDATING filedir_version. Instead, new version should be created. -- One exception is during importing, in which case we can disable the trigger @@ -166,8 +147,7 @@ INSERT INTO triggers VALUES('update_filedir_version', TRUE); CREATE TRIGGER update_filedir_version BEFORE UPDATE ON filedir_version BEGIN SELECT CASE WHEN (SELECT enabled FROM triggers WHERE name = 'update_filedir_version') - THEN RAISE (ABORT, - 'Updating filedir versions is prohibited, other than for id changes') + THEN RAISE (ABORT, 'Updating filedir versions is prohibited') END; END; -- TODO: check for inserting inconsistent version info re. deleted flag/filetype @@ -176,17 +156,28 @@ END; END; -- A computational environment which can execute "programs". Note that the -- python executable being used, and environment variables are recorded here. -- Other info is available in the parent "machine" table. 
-CREATE TABLE environment (id INTEGER PRIMARY KEY NOT NULL, +CREATE TABLE environment ( + sha256 TEXT PRIMARY KEY NOT NULL, + envvars_json TEXT, -- json.dumps(dict(os.environ)) python_implementation TEXT, -- platform.python_implementation(): 'cpython' python_strversion TEXT, -- sys.version: '3.9.7 (default, Sep 16 2021, 13:09:58) \n[GCC 7.5.0]' python_hexversion INTEGER, -- sys.hexversion: 50923504 - user INTEGER, - - -- disallow duplicate rows - UNIQUE(envvars_json, python_implementation, python_strversion, python_hexversion, user), - - FOREIGN KEY (user) REFERENCES user (id) ON UPDATE CASCADE + user TEXT REFERENCES user ON UPDATE CASCADE, + timezone TEXT, -- timezone, for interpreting event times + platform_release TEXT, -- platform.release(): '5.15.64' + freedesktop_os_release TEXT, -- requires python 3.10 + -- platform.freedesktop_os_release() as JSON + -- "{'NAME': 'NixOS', 'ID': 'nixos', + -- 'PRETTY_NAME': 'NixOS 22.05 (Quokka)', 'BUG_REPORT_URL': + -- 'https://github.com/NixOS/nixpkgs/issues', 'BUILD_ID': + -- '22.05.20220902.67e4507', 'DOCUMENTATION_URL': + -- 'https://nixos.org/learn.html', 'HOME_URL': 'https://nixos.org/', 'LOGO': + -- 'nix-snowflake', 'SUPPORT_URL': 'https://nixos.org/community.html', + -- 'VERSION': '22.05 (Quokka)', 'VERSION_CODENAME': 'quokka', 'VERSION_ID': + -- '22.05'}" + win32_ver TEXT, -- platform.win32_ver() as JSON + mac_ver TEXT -- platform.mac_ver() as JSON ); @@ -195,7 +186,8 @@ CREATE TABLE environment (id INTEGER PRIMARY KEY NOT NULL, -- python script, multiple "programs" may be created. Also note that historical -- programs are automatically imported and merged when possible when loading a -- "datum" from disk. -CREATE TABLE program (id INTEGER PRIMARY KEY NOT NULL, +CREATE TABLE program ( + uuid TEXT PRIMARY KEY NOT NULL, name TEXT, -- name of the program, usually written lowercase by calling code e.g. cnn_crossval -- Names of built-in operations will be shown in upper case: e.g. 
'FREEZE' @@ -205,65 +197,57 @@ CREATE TABLE program (id INTEGER PRIMARY KEY NOT NULL, end_time REAL, process_id INTEGER, -- host PID of python process on host OS - environment INTEGER NOT NULL, - message TEXT, -- user-defined message to help distinguish similar runs - - FOREIGN KEY (environment) REFERENCES environment (id) ON UPDATE CASCADE + environment TEXT NOT NULL REFERENCES environment ON UPDATE CASCADE, + message TEXT NOT NULL -- user-defined message to help distinguish similar runs ); -- We try to track all python packages that impact execution by traversing a -- copy of sys.modules. This is done once before a "program" and once after in -- case some calling code winds up calling a previously-unloaded module. -CREATE TABLE py_package (id INTEGER PRIMARY KEY NOT NULL, +CREATE TABLE py_package ( + sha256 TEXT PRIMARY KEY NOT NULL, name TEXT NOT NULL, - version TEXT + version TEXT, + + UNIQUE (name, version) ); -- A py_module describes any python module file containing decorated Functions. -- Modules are tracked since they impact the global scope of function calls. -CREATE TABLE py_module(id INTEGER PRIMARY KEY NOT NULL, +CREATE TABLE py_module( + sha256 TEXT PRIMARY KEY NOT NULL, name TEXT NOT NULL, - code TEXT, - py_package INTEGER, - - FOREIGN KEY (py_package) REFERENCES py_package (id) ON UPDATE CASCADE + code TEXT, -- code doesn't have to be included, but should be used to create sha256 + py_package TEXT REFERENCES py_package ON UPDATE CASCADE ); -- A py_function just describes a function, without reference to its arguments. -- It can have inputs and outputs, which are described in the func_inputs and -- func_outputs children tables. -CREATE TABLE py_function(id INTEGER PRIMARY KEY NOT NULL, +CREATE TABLE py_function( + sha256 TEXT PRIMARY KEY NOT NULL, name TEXT NOT NULL, - py_module INTEGER NOT NULL, - - -- we use POSIX timestamps for time recording. - -- e.g. 
datetime.datetime.now().timestamp() - start_time REAL, - end_time REAL, - - FOREIGN KEY (py_module) REFERENCES py_module (id) ON UPDATE CASCADE + py_module TEXT NOT NULL REFERENCES py_module ON UPDATE CASCADE ); -CREATE TABLE py_function_input(id INTEGER PRIMARY KEY NOT NULL, +CREATE TABLE py_function_input( + uuid TEXT PRIMARY KEY NOT NULL, name TEXT NOT NULL, typename TEXT NOT NULL, - py_function INTEGER NOT NULL, + py_function TEXT NOT NULL REFERENCES py_function ON UPDATE CASCADE, position INTEGER, posonly BOOL, kwonly BOOL, - description TEXT, - - FOREIGN KEY (py_function) REFERENCES py_function (id) ON UPDATE CASCADE + description TEXT ); -CREATE TABLE py_function_output(id INTEGER PRIMARY KEY NOT NULL, +CREATE TABLE py_function_output( + uuid TEXT PRIMARY KEY NOT NULL, name TEXT, typename TEXT NOT NULL, - py_function INTEGER NOT NULL, + py_function TEXT NOT NULL REFERENCES py_function ON UPDATE CASCADE, position INTEGER NOT NULL, - description TEXT, - - FOREIGN KEY (py_function) REFERENCES py_function (id) ON UPDATE CASCADE + description TEXT ); @@ -271,47 +255,42 @@ CREATE TABLE py_function_output(id INTEGER PRIMARY KEY NOT NULL, -- particular set of inputs which are themselves data (see datum table). A task -- is executed in the context of a "program". Within a program, tasks are -- typically evaluated in a serial manner. 
-CREATE TABLE task(id INTEGER PRIMARY KEY NOT NULL, - program INTEGER NOT NULL, +CREATE TABLE task( + uuid TEXT PRIMARY KEY NOT NULL, + program TEXT NOT NULL REFERENCES program ON UPDATE CASCADE, - py_function INTEGER, -- func is NULL for some built-in functionality like "record" programs - - FOREIGN KEY (py_function) REFERENCES py_function (id) ON UPDATE CASCADE, - FOREIGN KEY (program) REFERENCES program (id) ON UPDATE CASCADE + -- py_function is NULL for some built-in functionality like "RECORD" programs + py_function TEXT REFERENCES py_function ON UPDATE CASCADE ); -- A datum is an object that is computed as the output of a task, given as a -- literal value in a config file, or loaded from a file. -CREATE TABLE datum(id INTEGER PRIMARY KEY NOT NULL, - -- provider_type describes where the datum came from. Choices are: - -- - COMPUTED: output of a decorated Function - -- - IMPORTED: imported output from a prior program - -- - LITERAL: literal value provided in a config file - -- Note that literals are implicitly imported (hash computed) at runtime +CREATE TABLE datum( + uuid TEXT PRIMARY KEY NOT NULL, + -- provider_type describes where the datum came from. 
Choices are: + -- - COMPUTED: output of a decorated Function + -- - IMPORTED: imported output from a prior program + -- - LITERAL: literal value provided in a config file + -- Note that literals are implicitly imported (hash computed) at runtime provider_type TEXT, -- task and task_output are only used when provider_type='COMPUTED' - task INTEGER, - task_output INTEGER, + task TEXT REFERENCES task ON UPDATE CASCADE, + task_output TEXT REFERENCES func_output ON UPDATE CASCADE, literal_json TEXT, -- if provider_type='LITERAL', a JSON representation of the value - typename TEXT NOT NULL, -- string representation of the data type - - FOREIGN KEY (task) REFERENCES task (id) ON UPDATE CASCADE, - FOREIGN KEY (task_output) REFERENCES func_output (id) ON UPDATE CASCADE + typename TEXT NOT NULL -- string representation of the data type ); -- A task_input records the version of a Datum that is passed to a function -CREATE TABLE task_input(id INTEGER PRIMARY KEY NOT NULL, - task INTEGER NOT NULL, - py_function_input INTEGER, -- if this was a python function, reference which input +CREATE TABLE task_input( + uuid TEXT PRIMARY KEY NOT NULL, + task TEXT NOT NULL REFERENCES task ON UPDATE CASCADE, + -- if this was a python function, reference which input + py_function_input TEXT REFERENCES py_function_input ON UPDATE CASCADE, - datum INTEGER NOT NULL, + datum TEXT NOT NULL REFERENCES datum ON UPDATE CASCADE, -- Data have versions to facilitate tracking non-const operations. 
If a datum -- is passed to a non-const operation, it must increment its internal -- version - datum_version INTEGER NOT NULL, - - FOREIGN KEY (task) REFERENCES task (id) ON UPDATE CASCADE, - FOREIGN KEY (py_function_input) REFERENCES py_function_input (id) ON UPDATE CASCADE, - FOREIGN KEY (datum) REFERENCES datum (id) ON UPDATE CASCADE + datum_version INTEGER NOT NULL ); diff --git a/src/nancy/store.py b/src/nancy/store.py index 380f2dd..bb1d005 100644 --- a/src/nancy/store.py +++ b/src/nancy/store.py @@ -4,16 +4,18 @@ from loguru import logger from . import db, environment, fs +from contextlib import contextmanager from dataclasses import dataclass import datetime import os from pathlib import Path import sqlite3 -from typing import Any, Optional, TypeVar, Type, Union +from typing import Any, Iterator, Optional, TypeVar, Type, Union +import uuid import warnings -@dataclass +@dataclass(slots=True) class Program: store: "Store" name: str @@ -23,40 +25,45 @@ class Program: start_time: Optional[datetime.datetime] = None evaluated: bool = False + uuid: str = "" + + def __post_init__(self) -> None: + self.uuid = str(uuid.uuid4()) + def __enter__(self) -> "Program": if self.evaluated: raise RuntimeError("Cannot re-enter a Program context") - assert self.store.conn is not None - cur = self.store.conn.cursor() - env = environment.Environment.find_or_insert(cur) - - cur.execute( - "INSERT INTO program VALUES (?, ?, ?, ?, ?, ?, ?)", - ( - None, # id INTEGER PRIMARY KEY NOT NULL, - self.name, # name TEXT, - # name of the program, usually written lowercase by calling - # code e.g. cnn_crossval - # -- we use POSIX timestamps for time recording. - # -- e.g. 
datetime.datetime.now().timestamp() - None, # start_time REAL, - None, # end_time REAL, - os.getpid(), # process_id INTEGER, -- host PID of python process on host OS - env.id, # environment INTEGER NOT NULL, - self.message, # user-defined message to help distinguish similar runs - ), - ) - self.id = cur.lastrowid - + env = environment.Environment.detect() + with self.store.committing() as cur: + env.maybe_insert(cur) + cur.execute( + "INSERT INTO program VALUES (?, ?, ?, ?, ?, ?, ?)", + ( + self.uuid, # uuid + self.name, # name TEXT, + # name of the program, usually written lowercase by calling + # code e.g. cnn_crossval + # -- we use POSIX timestamps for time recording. + # -- e.g. datetime.datetime.now().timestamp() + None, # start_time REAL, + None, # end_time REAL, + os.getpid(), # process_id INTEGER, -- host PID of python process on host OS + env.sha256.hex(), # environment INTEGER NOT NULL, + self.message, # user-defined message to help distinguish similar runs + ), + ) + self.id = cur.lastrowid self.start_time = datetime.datetime.now() - return self - def new_task(self, name: str, py_function_id: Optional[int] = None) -> int: + def new_task( + self, + name: str, + cur: sqlite3.Cursor, + py_function_id: Optional[int] = None, + ) -> int: """Create a new task and return its id""" - assert self.store.conn is not None - cur = self.store.conn.cursor() cur.execute( "INSERT INTO task VALUES (?, ?, ?)", (None, self.id, py_function_id), @@ -75,21 +82,20 @@ class Program: # record start and end times in store assert self.store.conn is not None - cur = self.store.conn.cursor() - cur.execute( - """ - UPDATE - program - SET - start_time = ?, - end_time = ? - WHERE - id = ? - """, - (self.start_time, end_time, self.id), - ) - cur.connection.commit() - self._evaluated = True # prevent re-running + with self.store.committing() as cur: + cur.execute( + """ + UPDATE + program + SET + start_time = ?, + end_time = ? + WHERE + uuid = ? 
+ """, + (self.start_time, end_time, self.uuid), + ) + self.evaluated = True # prevent re-running assert self.start_time is not None elapsed = end_time - self.start_time logger.success( @@ -126,11 +132,14 @@ class Store: else: self.path = Path(directory) self.db_path = self.path / "nancy.db" + if conn is None: self.connect() else: self.conn = conn + self.store_uuid = self.find_store_uuid() + def copy(self: _StoreT, store_path: fs.PathStr) -> _StoreT: """Copy this store to a new store path""" assert self.conn is not None @@ -141,10 +150,28 @@ class Store: return self.__class__(store_path) def connect(self) -> sqlite3.Connection: - self.conn = sqlite3.connect(self.db_path) + self.conn = sqlite3.connect(self.db_path, isolation_level="DEFERRED") self.conn.cursor().execute("PRAGMA foreign_keys = ON;") return self.conn + @contextmanager + def committing(self) -> Iterator[sqlite3.Cursor]: + """Return a context manager that gives a cursor and commits on exit.""" + assert self.conn is not None + cur = self.conn.cursor() + yield cur + self.conn.commit() + + def find_store_uuid(self, cur: Optional[sqlite3.Cursor] = None) -> Optional[str]: + assert self.conn is not None + if cur is None: + cur = self.conn.cursor() + cur.execute( + 'SELECT value FROM local_metadata WHERE key == "store_uuid" LIMIT 1' + ) + res = cur.fetchone() + return None if res is None else res[0] + @classmethod def init( cls: Type[_StoreT], message: str, directory: Optional[fs.PathStr] = None @@ -163,14 +190,27 @@ class Store: f"File {db_path} exists. 
Refusing to re-initialize", ) # initialize a database in the target directory - conn = sqlite3.connect(db_path) - db.init_schema(conn.cursor()) + conn = sqlite3.connect(db_path, isolation_level="DEFERRED") + cur = conn.cursor() + db.init_schema(cur) + conn.commit() + cur.close() + new_store = cls(directory, conn) with new_store.program("INIT", message) as p: # set the timing to the actual times it took to initialize the db p.start_time = start_time + # generate a new UUID for this store + assert new_store.store_uuid is None + new_store.store_uuid = str(uuid.uuid4()) + with new_store.committing() as cur: + cur.execute( + 'INSERT INTO local_metadata VALUES ("store_uuid", ?)', + (new_store.store_uuid,), + ) + return new_store def make_readonly(self) -> None: @@ -345,24 +385,13 @@ class Store: cur = self.conn.cursor() with self.program("RECORD", message) as p: - # create a task for this operation - task_id = p.new_task("Store._record_recursive") + with self.committing() as cur: # entire record operation is one transaction + # create a task for this operation + task_id = p.new_task("Store._record_recursive", cur=cur) - # descend the diff, tracking parent filedir IDs, creating them and - # recording new versions of each, when necessary - self._record_recursive(diff, cur, source_task=task_id) - - -class StoreFile: - """Describes a file that is recorded in the store.""" - - def __init__(self, store: Store, rel_path: fs.PathStr): - self.store = store - self.rel_path = rel_path - - def save(self) -> None: - # call the appropriate save method - pass + # descend the diff, tracking parent filedir IDs, creating them and + # recording new versions of each, when necessary + self._record_recursive(diff, cur, source_task=task_id) def find_store(path: Union[str, "os.PathLike[str]"]) -> Optional[str]: diff --git a/src/nancy/user.py b/src/nancy/user.py index 0ad9265..f26c7a0 100644 --- a/src/nancy/user.py +++ b/src/nancy/user.py @@ -1,76 +1,59 @@ from . 
import machine +from dataclasses import dataclass +import hashlib import getpass import os import pwd import sqlite3 -from typing import NamedTuple, Optional, Type, TypeVar +from typing import Callable, NamedTuple, Optional, Type, TypeVar # see https://stackoverflow.com/questions/44640479/type-annotation-for-classmethod-returning-instance _UserT = TypeVar("_UserT", bound="User") -class User(NamedTuple): - id: Optional[int] # if not None, this is `id` in the `machine` table +@dataclass(frozen=True, slots=True) +class User: username: str userid: int fullname: str machine: machine.Machine + sha256: bytes = b"" - @classmethod - def find_or_insert( - cls: Type[_UserT], cur: sqlite3.Cursor, user: Optional[_UserT] = None - ) -> _UserT: - """Given a DB cursor, find or create row in user table and fill""" - if user is None: - user = cls.detect() + def __post_init__(self) -> None: + m = hashlib.sha256() + upstr: Callable[[str], None] = lambda s: m.update(bytes(s, "utf-8")) + upint: Callable[[int], None] = lambda n: m.update( + n.to_bytes((n.bit_length() + 7) // 8, byteorder="big"), + ) + upstr(self.username) + upint(self.userid) # NOTE(review): uid 0 hashes as zero bytes + upstr(self.fullname) + m.update(self.machine.sha256) + # circumvent freezing to set sha256 + object.__setattr__(self, "sha256", m.digest()) - m = machine.Machine.find_or_insert(cur) + def maybe_insert(self, cur: sqlite3.Cursor) -> str: + """Insert if not exists, without committing.""" + self.machine.maybe_insert(cur) - user = user._replace(machine=m) + key = self.sha256.hex() - # insert or ignore, handle each case to set id cur.execute( """ - SELECT - id - FROM - user - WHERE - username = ? AND - userid = ? AND - fullname = ? AND - machine = ? 
- LIMIT 1 + INSERT OR IGNORE INTO user VALUES (?,?,?,?,?); """, ( - user.username, - user.userid, - user.fullname, - user.machine.id, + key, + self.username, + self.userid, + self.fullname, + self.machine.sha256.hex(), ), ) - res = cur.fetchone() - if res is None: - cur.execute( - """ - INSERT INTO user VALUES (?,?,?,?,?); - """, - ( - user.id, - user.username, - user.userid, - user.fullname, - user.machine.id, - ), - ) - id = cur.lastrowid - cur.connection.commit() - else: - id = res[0] - return user._replace(id=id) + return key @classmethod def detect(cls: Type[_UserT]) -> _UserT: @@ -81,12 +64,9 @@ class User(NamedTuple): # TODO: will this fail on Windows/OSX? fullname = pwd.getpwuid(os.getuid()).pw_gecos - m = machine.Machine.detect() - return cls( - None, getpass.getuser(), os.getuid(), fullname, - m, + machine=machine.Machine.detect(), ) diff --git a/src/nancy/version.py b/src/nancy/version.py index 4913370..03642c1 100644 --- a/src/nancy/version.py +++ b/src/nancy/version.py @@ -1,3 +1,3 @@ -from .db import schema_version # NOQA +from .db import schema_version as schema_version # NOQA __version__ = "0.1.0" diff --git a/tests/test_db.py b/tests/test_db.py index fafba8a..4a99d5f 100644 --- a/tests/test_db.py +++ b/tests/test_db.py @@ -6,10 +6,11 @@ import pytest import datetime import os import sqlite3 +from typing import Iterator @pytest.fixture -def temp_db(): +def temp_db() -> Iterator[sqlite3.Cursor]: """Create an in-memory database that follow's the nancy schema""" with sqlite3.connect(":memory:") as conn: cur = conn.cursor() @@ -18,47 +19,42 @@ def temp_db(): db.init_schema(cur) + cur.execute( + "INSERT INTO local_metadata VALUES ('store_uuid', ?)", + ("78dc0b93-0e22-45ee-ae90-49d3575dd70f",), + ) + yield cur @pytest.fixture -def insert_machine(temp_db): +def insert_machine(temp_db: sqlite3.Cursor) -> sqlite3.Cursor: cur = temp_db cur.executemany( - "INSERT INTO machine VALUES " "(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", + "INSERT INTO machine VALUES " "(?, ?, 
?, ?, ?, ?)", [ ( - None, # id INTEGER PRIMARY KEY NOT NULL, - "a5d97c08a15c4db69f5fded523a1bfe3", # machine_id TEXT, -- platform-dependent unique hardware id + "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", # sha256 + "a5d97c08a15c4db69f5fded523a1bfe3", # machine_id TEXT "lucky", # hostname TEXT, -- platform.node(): 'lucky' - "", # processor TEXT, -- platform.processor(): "Linux", # system TEXT, -- platform.system(): 'Linux' - "5.15.64", # release TEXT, -- platform.release(): '5.15.64' - "aarch64", # machine TEXT, -- platform.machine(): 'x86_64' - "EDT", # timezone TEXT, -- timezone, for interpreting event times - "", # freedesktop_os_release TEXT, -- requires python 3.10 - "", # win32_ver TEXT, -- platform.win32_ver() as JSON - "", # mac_ver TEXT -- platform.mac_ver() as JSON + "x86_64", # cpu_type TEXT, -- platform.machine(): + "i386", # processor TEXT, -- platform.processor(): ), ( - None, # id INTEGER PRIMARY KEY NOT NULL, - "afc9b06a23b74341b29d42b8312a4f8a", + "2c26b46b68ffc68ff99b453c1d30413413422d706483bfa0f98a5e886266e7ae", # sha256 + "b5d97c08a15c4db69f5fded523a1bfe3", # machine_id TEXT "a100", # hostname TEXT, -- platform.node(): 'lucky' - "", # processor TEXT, -- platform.processor(): "Linux", # system TEXT, -- platform.system(): 'Linux' - "5.15.63", # release TEXT, -- platform.release(): '5.15.64' - "x86_64", # machine TEXT, -- platform.machine(): 'x86_64' - "EST", # timezone TEXT, -- timezone, for interpreting event times - "", # freedesktop_os_release TEXT, -- requires python 3.10 - "", # win32_ver TEXT, -- platform.win32_ver() as JSON - "", # mac_ver TEXT -- platform.mac_ver() as JSON + "x86_64", # cpu_type TEXT, -- platform.machine(): + "i386", # processor TEXT, -- platform.processor(): ), ], ) return cur -def test_insert_machine(insert_machine): +def test_insert_machine(insert_machine: sqlite3.Cursor) -> None: cur = insert_machine cur.execute("SELECT * FROM machine") machines = cur.fetchall() @@ -66,45 +62,45 @@ def 
test_insert_machine(insert_machine): @pytest.fixture -def insert_user(insert_machine): +def insert_user(insert_machine: sqlite3.Cursor) -> sqlite3.Cursor: cur = insert_machine cur.executemany( - "INSERT INTO user VALUES " "(?, ?, ?, ?, ?)", + "INSERT INTO user VALUES (?, ?, ?, ?, ?)", [ - ( - None, # id INTEGER PRIMARY KEY NOT NULL, + ( # jacob@lucky + "f2ee9db3526d36b4346980f12a49cb404b924cf784dbd3d5db8a362e363a8070", # sha256 "jacob", # username TEXT NOT NULL, 101, # userid INTEGER, "Jacob Hinkle", # fullname TEXT, - 1, # machine INTEGER NOT NULL, + "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", # machine ), - ( - None, # id INTEGER PRIMARY KEY NOT NULL, + ( # jacob@a100 + "3219c598f4e6e84bea3d0c295f0d2c35f3ce01747f4839fb698b32646b037d3c", # sha256 "jacob", # username TEXT NOT NULL, 10301, # userid INTEGER, "Jacob Hinkle", # fullname TEXT, - 2, # machine INTEGER NOT NULL, + "2c26b46b68ffc68ff99b453c1d30413413422d706483bfa0f98a5e886266e7ae", # machine ), - ( - None, # id INTEGER PRIMARY KEY NOT NULL, + ( # bob@a100 + "99389acaef44ed99c16bcaf1bff0c261b35ad110f0544092f711a1354e616f61", # sha256 "bob", # username TEXT NOT NULL, 2035, # userid INTEGER, "Just Bob", # fullname TEXT, - 2, # machine INTEGER NOT NULL, + "2c26b46b68ffc68ff99b453c1d30413413422d706483bfa0f98a5e886266e7ae", # machine ), ], ) return cur -def test_insert_user(insert_user): +def test_insert_user(insert_user: sqlite3.Cursor) -> None: cur = insert_user cur.execute("SELECT * FROM user") users = cur.fetchall() assert len(users) == 3 -def test_invalid_user_machine(insert_user): +def test_invalid_user_machine(insert_user: sqlite3.Cursor) -> None: cur = insert_user with pytest.raises(sqlite3.IntegrityError): # should fail foreign key constraint @@ -133,66 +129,44 @@ def test_invalid_user_machine(insert_user): @pytest.fixture -def insert_store(insert_machine): +def insert_store(insert_machine: sqlite3.Cursor) -> sqlite3.Cursor: import uuid cur = insert_machine cur.executemany( - 
"INSERT INTO store VALUES " "(?, ?, ?, ?, ?)", + "INSERT INTO store VALUES " "(?)", [ - ( - None, # id INTEGER PRIMARY KEY NOT NULL, - 1, # machine INTEGER, - "/path/to/first/store", # dbpath TEXT NOT NULL, - str(uuid.uuid4()), # -- UUID generated by str(uuid.uuid4()) - False, # imported BOOL, - ), - ( - None, # id INTEGER PRIMARY KEY NOT NULL, - 1, # machine INTEGER, - "/path/to/dependency/store", # dbpath TEXT NOT NULL, - str(uuid.uuid4()), # -- UUID generated by str(uuid.uuid4()) - True, # imported BOOL, - ), - ( - None, # id INTEGER PRIMARY KEY NOT NULL, - 2, # machine INTEGER, - # same path but on a separate machine - "/path/to/first/store", # dbpath TEXT NOT NULL, - str(uuid.uuid4()), # -- UUID generated by str(uuid.uuid4()) - True, # imported BOOL, - ), + ("63c2a84a-524f-46ad-b512-b99e80f7385b",), + ("3be35ff6-6a7b-49cf-ba3e-c610b781b54e",), + ("b8f685d8-2322-403d-a59c-fb8301229788",), ], ) return cur @pytest.fixture -def insert_directories(insert_store): +def insert_directories(insert_store: sqlite3.Cursor) -> sqlite3.Cursor: cur = insert_store cur.executemany( - "INSERT INTO filedir VALUES " "(?, ?, ?, ?, ?)", + "INSERT INTO filedir VALUES (?, ?, ?, ?)", [ ( - None, # id INTEGER PRIMARY KEY NOT NULL, - 1, # store INTEGER NOT NULL, - ".", # filename TEXT, -- only a filename, not a path - None, # parent INTEGER REFERENCES filedir ON UPDATE CASCADE, - False, # frozen BOOL NOT NULL, + "71e75275-0847-4e47-9df5-1f2bbe01da91", # uuid + "63c2a84a-524f-46ad-b512-b99e80f7385b", # store + ".", # filename + None, # parent ), ( - None, # id INTEGER PRIMARY KEY NOT NULL, - 1, # store INTEGER NOT NULL, - "foo", # filename TEXT, -- only a filename, not a path - 1, # parent INTEGER REFERENCES filedir ON UPDATE CASCADE, - False, # frozen BOOL NOT NULL, + "6051a251-d38a-4d8c-ba57-ac74023ec2f4", # uuid + "63c2a84a-524f-46ad-b512-b99e80f7385b", # store + "foo", # filename + "71e75275-0847-4e47-9df5-1f2bbe01da91", # parent ), ( - None, # id INTEGER PRIMARY KEY NOT NULL, - 2, # 
store INTEGER NOT NULL, - ".", # filename TEXT, -- only a filename, not a path - None, # parent INTEGER REFERENCES filedir ON UPDATE CASCADE, - False, # frozen BOOL NOT NULL, + "5f74b5f2-eac8-4a86-90f6-4f7c7981564d", # uuid + "3be35ff6-6a7b-49cf-ba3e-c610b781b54e", # store + ".", # filename + None, # parent ), ], ) @@ -200,34 +174,34 @@ def insert_directories(insert_store): "INSERT INTO filedir_version VALUES " "(?, ?, ?, ?, ?, ?, ?, ?, ?)", [ ( - None, # id INTEGER PRIMARY KEY NOT NULL, - 1, # INTEGER REFERENCES filedir ON UPDATE CASCADE, -- parent filedir entry + "baad81ef-7cc6-48df-973c-e9f32ceda19a", # uuid + "71e75275-0847-4e47-9df5-1f2bbe01da91", # filedir datetime.datetime.now().timestamp(), "DIR", # filetype TEXT, -- One of 'LNK', 'DIR', 'REG', etc. See store.FSEntry.from_path for details False, # deleted BOOL NOT NULL, -- set True when recording a deleted file - "drwxrwxr-x", # unfrozen_perms TEXT, -- stat.filemode(os.stat(path).st_mode): '-rw-rw-r--' + "drwxrwxr-x", # perms TEXT, -- stat.filemode(os.stat(path).st_mode): '-rw-rw-r--' None, # symlink_target TEXT, -- if this is a symlink, this is the (read but not fully resolved) target. I.e. this is the "content" of the symlink. "a84ed33864d06615a87bc8da5258d841163f1e7969367ecd07b041ae1a18febd", # sha256 TEXT, None, # source_task INTEGER, ), ( - None, # id INTEGER PRIMARY KEY NOT NULL, - 1, # INTEGER REFERENCES filedir ON UPDATE CASCADE, -- parent filedir entry + "c4037a74-88cd-4a1d-b158-4ebebb25f10d", # uuid + "71e75275-0847-4e47-9df5-1f2bbe01da91", # filedir datetime.datetime.now().timestamp(), "DIR", # filetype TEXT, -- One of 'LNK', 'DIR', 'REG', etc. 
See store.FSEntry.from_path for details False, # deleted BOOL NOT NULL, -- set True when recording a deleted file - "drwxrwxr-x", # unfrozen_perms TEXT, -- stat.filemode(os.stat(path).st_mode): '-rw-rw-r--' + "drwxrwxr-x", # perms TEXT, -- stat.filemode(os.stat(path).st_mode): '-rw-rw-r--' None, # symlink_target TEXT, -- if this is a symlink, this is the (read but not fully resolved) target. I.e. this is the "content" of the symlink. "a84ed33864d06615a87bc8da5258d841163f1e7969367ecd07b041ae1a18febd", # sha256 TEXT, None, # source_task INTEGER, ), ( - None, # id INTEGER PRIMARY KEY NOT NULL, - 1, # INTEGER REFERENCES filedir ON UPDATE CASCADE, -- parent filedir entry + "dcfbe086-3c4b-43b8-a550-974610b86267", # uuid + "5f74b5f2-eac8-4a86-90f6-4f7c7981564d", # filedir datetime.datetime.now().timestamp(), "DIR", # filetype TEXT, -- One of 'LNK', 'DIR', 'REG', etc. See store.FSEntry.from_path for details False, # deleted BOOL NOT NULL, -- set True when recording a deleted file - "drwxrwxr-x", # unfrozen_perms TEXT, -- stat.filemode(os.stat(path).st_mode): '-rw-rw-r--' + "drwxrwxr-x", # perms TEXT, -- stat.filemode(os.stat(path).st_mode): '-rw-rw-r--' None, # symlink_target TEXT, -- if this is a symlink, this is the (read but not fully resolved) target. I.e. this is the "content" of the symlink. 
"a84ed33864d06615a87bc8da5258d841163f1e7969367ecd07b041ae1a18febd", # sha256 TEXT, None, # source_task INTEGER, @@ -237,18 +211,17 @@ def insert_directories(insert_store): return cur -def test_crossstore_directory_insert(insert_directories): +def test_crossstore_directory_insert(insert_directories: sqlite3.Cursor) -> None: cur = insert_directories with pytest.raises(sqlite3.IntegrityError): # declaring directory as belonging to store 2, but parent's store is 1 cur.execute( - "INSERT INTO filedir VALUES " "(?, ?, ?, ?, ?)", + "INSERT INTO filedir VALUES (?, ?, ?, ?)", ( - None, # id INTEGER PRIMARY KEY NOT NULL, - 2, # store INTEGER NOT NULL, - "some_dir", # filename TEXT, -- only a filename, not a path - 1, # parent INTEGER REFERENCES filedir ON UPDATE CASCADE, - False, # frozen BOOL NOT NULL, + "b9774e78-2646-476e-b4a8-f6df0b10ba8e", # uuid + "3be35ff6-6a7b-49cf-ba3e-c610b781b54e", # store + "some_dir", # filename + "baad81ef-7cc6-48df-973c-e9f32ceda19a", # parent ), ) for row in cur.connection.iterdump(): @@ -258,7 +231,7 @@ def test_crossstore_directory_insert(insert_directories): @pytest.fixture -def insert_files(insert_directories): +def insert_files(insert_directories: sqlite3.Cursor) -> sqlite3.Cursor: cur = insert_directories cur.execute("SELECT COUNT(*) FROM filedir") (nprev,) = cur.fetchone() @@ -328,7 +301,7 @@ def insert_files(insert_directories): # TODO: This test is disabled until triggers are added to check for these types # of constraints. These became much more complicated to check when I added # filedir_version. 
-def disabled_test_nondir_parent_directory_insert(insert_files): +def disabled_test_nondir_parent_directory_insert(insert_files: sqlite3.Cursor) -> None: cur = insert_files with pytest.raises(sqlite3.IntegrityError): # declaring parent as 5, but 5 is a file (plots.png) diff --git a/tests/test_store.py b/tests/test_store.py index a3126ec..8429516 100644 --- a/tests/test_store.py +++ b/tests/test_store.py @@ -1,19 +1,23 @@ +"""Test functionality of the Store class.""" +from nancy import store + import os from pathlib import Path import pytest import sys import tempfile +from typing import Iterator @pytest.fixture -def bare_dir(): +def bare_dir() -> Iterator[Path]: """Create an emptry temp directory""" with tempfile.TemporaryDirectory(prefix="nancy_testdir") as d: yield Path(d) @pytest.fixture -def filled_dir(bare_dir): +def filled_dir(bare_dir: Path) -> Path: open(bare_dir / "a.txt", "w").write("foo") os.makedirs(bare_dir / "stats") open(bare_dir / "stats" / "metrics.csv", "w").write("bar,baz") @@ -22,24 +26,23 @@ def filled_dir(bare_dir): return bare_dir -def test_record_untracked_dir(filled_dir): +def test_record_untracked_dir(filled_dir: Path) -> None: from nancy.cli.record import record - record(filled_dir, message="test_record_untracked_dir") + record(store_path=filled_dir, message="test_record_untracked_dir") @pytest.fixture -def store(): - from nancy import store - +def empty_store() -> Iterator[store.Store]: s = store.Store.init(message="test init") yield s -def test_schema_version_match(store): +def test_schema_version_match(empty_store: store.Store) -> None: from nancy.version import schema_version - cur = store.conn.cursor() + assert empty_store.conn is not None + cur = empty_store.conn.cursor() (db_schema_ver,) = cur.execute("PRAGMA user_version;").fetchone() assert schema_version == db_schema_ver