Change to using UUID and SHA256 as DB keys

This is important for distributed settings and will make merging
databases much simpler. The change is extensive: among other things, it
moves some fields between tables and introduces the `local_metadata`
table, a key-value store that holds the UUID of the "local" store, i.e.
the one corresponding to the directory holding that particular nancy.db.
This commit is contained in:
Jacob Hinkle 2022-10-10 15:14:47 -04:00
parent c4648ec042
commit 07ccef601c
13 changed files with 425 additions and 468 deletions

View File

@ -5,7 +5,7 @@ description = "Composable tracking of scientific data provenance"
authors = ["Jacob Hinkle <jacob.hinkle@jhink.org>"] authors = ["Jacob Hinkle <jacob.hinkle@jhink.org>"]
[tool.poetry.dependencies] [tool.poetry.dependencies]
python = "^3.8" python = "^3.10"
click = "^8.1.3" click = "^8.1.3"
colorama = "^0.4.5" colorama = "^0.4.5"
loguru = "^0.6.0" loguru = "^0.6.0"
@ -28,7 +28,7 @@ build-backend = "poetry.core.masonry.api"
[tool.tox] [tool.tox]
legacy_tox_ini = """ legacy_tox_ini = """
[tox] [tox]
envlist = py38,py39,py310,mypy envlist = mypy,py310
isolated_build = true isolated_build = true
[testenv] [testenv]
@ -40,7 +40,10 @@ commands =
pytest --cov src/nancy pytest --cov src/nancy
[testenv:mypy] [testenv:mypy]
deps = mypy deps =
mypy
pytest
commands = commands =
mypy --strict -p nancy mypy --strict --ignore-missing-imports --no-incremental -p nancy
mypy --strict --no-incremental tests/
""" """

View File

@ -48,13 +48,16 @@ def print_diff(
dirstr = ( dirstr = (
(filetypecolors["DIR"] + dname + "/" + reset) if dname != "" else "" (filetypecolors["DIR"] + dname + "/" + reset) if dname != "" else ""
) )
fname = filetypecolors.get(entry.filetype, "") + fname + reset assert entry.filetype is not None
fname = filetypecolors.get(str(entry.filetype), "") + fname + reset
if entry.filetype == "LNK": # append symlink target if entry.filetype == fs.FileType.LNK: # append symlink target
assert entry.symlink_target is not None
fname += " -> " + entry.symlink_target fname += " -> " + entry.symlink_target
relpath = dirstr + fname relpath = dirstr + fname
assert entry.sha256 is not None
hashchange = ( hashchange = (
(hashcolor + entry.sha256.hex() + reset + " " + changetags[tag]) (hashcolor + entry.sha256.hex() + reset + " " + changetags[tag])
if show_hashes if show_hashes

View File

@ -3,11 +3,12 @@ import click
from .. import store from .. import store
import os import os
from typing import List
@click.command() @click.command()
@click.argument("directory") @click.argument("directory")
def freeze(directory): def freeze(directory: str) -> None:
""" """
Initialize tracking in a directory or freeze a tracked directory. Initialize tracking in a directory or freeze a tracked directory.
@ -25,12 +26,12 @@ def freeze(directory):
else: # this is an existing store else: # this is an existing store
s = store.Store(directory) s = store.Store(directory)
s.freeze() # s.freeze()
@click.command() @click.command()
@click.argument("files", nargs=-1) # , help="Files or directories to thaw.") @click.argument("files", nargs=-1) # , help="Files or directories to thaw.")
def thaw(files): def thaw(files: List[str]) -> None:
""" """
Enable manual alteration of files within a tracked directory. Enable manual alteration of files within a tracked directory.

View File

@ -32,7 +32,7 @@ def record(
if not os.path.exists(os.path.join(store_path, "nancy.db")): if not os.path.exists(os.path.join(store_path, "nancy.db")):
# this is a new store # this is a new store
logger.info(f"Initializing new store in {store_path}...") logger.info(f"Initializing new store in {store_path}...")
s = store.Store.init(store_path) s = store.Store.init(message=message, directory=store_path)
else: # this is an existing store else: # this is an existing store
s = store.Store(store_path) s = store.Store(store_path)
@ -78,7 +78,7 @@ def record(
"of given paths. If given the path to a non-store directory, a new " "of given paths. If given the path to a non-store directory, a new "
"store is initialized there.", "store is initialized there.",
) )
def record_cli(show_hashes, no_color, message, store): def record_cli(show_hashes: bool, no_color: bool, message: str, store: str) -> None:
""" """
Initialize tracking or record changes to a tracked directory. Initialize tracking or record changes to a tracked directory.
""" """

View File

@ -1,83 +1,79 @@
from . import user from . import user
from dataclasses import dataclass
import hashlib
import json import json
import os import os
import platform import platform
import sqlite3 import sqlite3
import sys import sys
from typing import NamedTuple, Optional, TypeVar, Type import time
from typing import Callable, Optional, TypeVar, Type
# see https://stackoverflow.com/questions/44640479/type-annotation-for-classmethod-returning-instance # see https://stackoverflow.com/questions/44640479/type-annotation-for-classmethod-returning-instance
_EnvironmentT = TypeVar("_EnvironmentT", bound="Environment") _EnvironmentT = TypeVar("_EnvironmentT", bound="Environment")
class Environment(NamedTuple): @dataclass(frozen=True, slots=True)
id: Optional[int] class Environment:
envvars_json: str envvars_json: str
python_implementation: str python_implementation: str
python_strversion: str python_strversion: str
python_hexversion: int python_hexversion: int
user: user.User user: user.User
timezone: str
release: str
freedesktop_os_release: str
win32_ver: str
mac_ver: str
sha256: bytes = b""
@classmethod def __post_init__(self) -> None:
def find_or_insert( m = hashlib.sha256()
cls: Type[_EnvironmentT], upstr: Callable[[str], None] = lambda s: m.update(bytes(s, "utf-8"))
cur: sqlite3.Cursor, upint: Callable[[int], None] = lambda n: m.update(
env: Optional[_EnvironmentT] = None, n.to_bytes((n.bit_length() + 7) // 8, byteorder="big"),
) -> _EnvironmentT: )
"""Given a DB cursor, find or create row in environment table and fill""" upstr(self.envvars_json)
if env is None: upstr(self.python_implementation)
env = cls.detect() upstr(self.python_strversion)
upint(self.python_hexversion)
m.update(self.user.sha256)
upstr(self.timezone)
upstr(self.release)
upstr(self.freedesktop_os_release)
upstr(self.win32_ver)
upstr(self.mac_ver)
# circumvent freezing to set sha256
object.__setattr__(self, "sha256", m.digest())
u = user.User.find_or_insert(cur) def maybe_insert(self, cur: sqlite3.Cursor) -> str:
"""Insert if not exists, without committing."""
self.user.maybe_insert(cur)
env = env._replace(user=u) key = self.sha256.hex()
# insert or ignore, handle each case to set id
cur.execute( cur.execute(
""" """
SELECT INSERT OR IGNORE INTO environment VALUES (?,?,?,?,?,?,?,?,?,?,?);
id
FROM
environment
WHERE
envvars_json = ? AND
python_implementation = ? AND
python_strversion = ? AND
python_hexversion = ? AND
user = ?
LIMIT 1
""", """,
( (
env.envvars_json, key,
env.python_implementation, self.envvars_json,
env.python_strversion, self.python_implementation,
env.python_hexversion, self.python_strversion,
env.user.id, self.python_hexversion,
self.user.sha256.hex(),
self.timezone,
self.release,
self.freedesktop_os_release,
self.win32_ver,
self.mac_ver,
), ),
) )
res = cur.fetchone()
if res is None:
cur.execute(
"""
INSERT INTO environment VALUES (?,?,?,?,?,?);
""",
(
env.id,
env.envvars_json,
env.python_implementation,
env.python_strversion,
env.python_hexversion,
env.user.id,
),
)
id = cur.lastrowid
cur.connection.commit()
else:
id = res[0]
return env._replace(id=id) return key
@classmethod @classmethod
def detect(cls: Type[_EnvironmentT]) -> _EnvironmentT: def detect(cls: Type[_EnvironmentT]) -> _EnvironmentT:
@ -87,11 +83,22 @@ class Environment(NamedTuple):
""" """
u = user.User.detect() u = user.User.detect()
fdor = ""
try:
fdor = json.dumps(platform.freedesktop_os_release())
except AttributeError:
# freedesktop_os_release only available for python >= 3.10
fdor = ""
return cls( return cls(
None,
json.dumps(dict(os.environ)), json.dumps(dict(os.environ)),
platform.python_implementation(), platform.python_implementation(),
sys.version, sys.version,
sys.hexversion, sys.hexversion,
u, u,
timezone=time.tzname[time.daylight],
release=platform.release(),
freedesktop_os_release=fdor,
win32_ver=json.dumps(platform.win32_ver()),
mac_ver=json.dumps(platform.mac_ver()),
) )

View File

@ -203,7 +203,7 @@ class FSEntry:
) )
s = filestat.st_mode s = filestat.st_mode
children = [] children: List[FSEntry] = []
symlink_target: Optional[Union[str, bytes]] = None symlink_target: Optional[Union[str, bytes]] = None
if os.path.islink(path): if os.path.islink(path):
# Check links first, since it is not exclusive with dir or file checks # Check links first, since it is not exclusive with dir or file checks
@ -518,7 +518,7 @@ class FSDiff:
allnames = set(list(Alist.keys()) + list(Blist.keys())) allnames = set(list(Alist.keys()) + list(Blist.keys()))
modified_children = [ modified_children: List[FSDiff] = [
cls.compute( cls.compute(
Alist.get(n, None), Alist.get(n, None),
Blist.get(n, None), Blist.get(n, None),

View File

@ -1,104 +1,83 @@
from typing import NamedTuple, Optional, Type, TypeVar from dataclasses import dataclass
import hashlib
import json import json
import platform import platform
import sqlite3 import sqlite3
import time import time
from typing import Callable, NamedTuple, Optional, Type, TypeVar
# see https://stackoverflow.com/questions/44640479/type-annotation-for-classmethod-returning-instance # see https://stackoverflow.com/questions/44640479/type-annotation-for-classmethod-returning-instance
_MachineT = TypeVar("_MachineT", bound="Machine") _MachineT = TypeVar("_MachineT", bound="Machine")
class Machine(NamedTuple): @dataclass(frozen=True, slots=True)
id: Optional[int] class Machine:
machine_id: Optional[str] machine_id: Optional[str]
hostname: str hostname: str
processor: str
system: str system: str
release: str
cpu_type: str cpu_type: str
timezone: str processor: str
freedesktop_os_release: str sha256: bytes = b""
win32_ver: str
mac_ver: str
@classmethod def __post_init__(self) -> None:
def find_or_insert( """Derive sha256 from other attributes."""
cls: Type[_MachineT], cur: sqlite3.Cursor, machine: Optional[_MachineT] = None m = hashlib.sha256()
) -> _MachineT: upstr: Callable[[str], None] = lambda s: m.update(bytes(s, "utf-8"))
"""Given a DB cursor, find or create row in machine table and fill""" if self.machine_id is not None:
if machine is None: upstr(self.machine_id)
machine = cls.detect() upstr(self.hostname)
upstr(self.processor)
upstr(self.system)
upstr(self.cpu_type)
# circumvent freezing to set sha256
object.__setattr__(self, "sha256", m.digest())
def maybe_insert(self, cur: sqlite3.Cursor) -> str:
"""Insert if not exists, without committing."""
key = self.sha256.hex()
# insert or ignore, handle each case to set id
cur.execute( cur.execute(
""" """
SELECT INSERT OR IGNORE INTO machine VALUES (?,?,?,?,?,?);
id
FROM
machine
WHERE
machine_id = ? AND
hostname = ? AND
processor = ? AND
system = ? AND
release = ? AND
cpu_type = ? AND
timezone = ? AND
freedesktop_os_release = ? AND
win32_ver = ? AND
mac_ver = ?
LIMIT 1
""", """,
machine[1:], (
key,
self.machine_id,
self.hostname,
self.system,
self.cpu_type,
self.processor,
),
) )
res = cur.fetchone()
if res is None:
cur.execute(
"""
INSERT INTO machine VALUES (?,?,?,?,?,?,?,?,?,?,?);
""",
machine,
)
id = cur.lastrowid
cur.connection.commit()
else:
id = res[0]
return machine._replace(id=id) return key
@classmethod @classmethod
def detect(cls: Type[_MachineT]) -> _MachineT: def detect(cls: Type[_MachineT]) -> _MachineT:
"""Formats machine-specific information into a MachineInfo object. """Format machine-specific information into a MachineInfo object.
Note that 'MachineInfo' objects are properly formatted to be inserted into Note that 'MachineInfo' objects are properly formatted to be inserted into
the `machine` table. the `machine` table.
Raises:
NotImplementedError: if system != "Linux"
""" """
fdor = ""
try:
fdor = json.dumps(platform.freedesktop_os_release())
except AttributeError:
# freedesktop_os_release only available for python >= 3.10
fdor = ""
system = platform.system() system = platform.system()
mid = None mid = None
if system == "Linux": if system.lower() == "linux":
try: try:
mid = open("/etc/machine-id", "r").read() mid = open("/etc/machine-id", "r").read()
except FileNotFoundError: except FileNotFoundError:
pass pass
else:
raise NotImplementedError()
return cls( return cls(
None,
machine_id=mid, machine_id=mid,
hostname=platform.node(), hostname=platform.node(),
processor=platform.processor(),
system=system, system=system,
release=platform.release(),
cpu_type=platform.machine(), cpu_type=platform.machine(),
timezone=time.tzname[time.daylight], processor=platform.processor(),
freedesktop_os_release=fdor,
win32_ver=json.dumps(platform.win32_ver()),
mac_ver=json.dumps(platform.mac_ver()),
) )

View File

@ -8,6 +8,15 @@ PRAGMA foreign_keys=ON;
-- facilitate easy importing of one database into another -- facilitate easy importing of one database into another
-- Simple key/value table describing _this_ store (not imported ones).
-- In particular, the key "local_store" should be the UUID of the store in this
-- directory.
CREATE TABLE local_metadata(
key TEXT PRIMARY KEY NOT NULL,
value TEXT
);
-- This lets us enable or disable triggers. For example, when importing we need -- This lets us enable or disable triggers. For example, when importing we need
-- to update some tables to account for changing primary keys using ON UPDATE -- to update some tables to account for changing primary keys using ON UPDATE
-- CASCADE. However, some dependent tables like filedir may have a trigger that -- CASCADE. However, some dependent tables like filedir may have a trigger that
@ -18,58 +27,38 @@ PRAGMA foreign_keys=ON;
-- Note that entries in this table only need to be inserted when we know we have -- Note that entries in this table only need to be inserted when we know we have
-- a need to disable the trigger. Otherwise, this table can be ignored. -- a need to disable the trigger. Otherwise, this table can be ignored.
CREATE TABLE triggers( CREATE TABLE triggers(
name TEXT PRIMARY KEY, name TEXT PRIMARY KEY NOT NULL,
enabled BOOL enabled BOOL
); );
-- We track host machines, so that we can give better information about file -- We track host machines, so that we can give better information about file
-- locations. This also allows us to see whether platform-dependent behavior may -- locations. This also allows us to see whether platform-dependent behavior may
-- come into play. Note that these are not reliable identifiers for purposes of -- come into play.
-- tracking down data, but may be helpful context. -- A row of this table should describe the aspects of a node that are unlikely
CREATE TABLE machine(id INTEGER PRIMARY KEY NOT NULL, -- to change during regular software updates. For example, the hostname and CPU
-- type are included but not the kernel version. Software that changes due to
-- updates should be included in the "environment" table instead.
CREATE TABLE machine(
sha256 TEXT PRIMARY KEY NOT NULL,
machine_id TEXT, -- platform-dependent unique hardware id machine_id TEXT, -- platform-dependent unique hardware id
-- Linux: open('/etc/machine-id', 'r').read() (assumes systemd) -- Linux: open('/etc/machine-id', 'r').read() (assumes systemd)
-- OSX: `ioreg -rd1 -c IOPlatformExpertDevice | grep IOPlatformUUID` -- OSX: `ioreg -rd1 -c IOPlatformExpertDevice | grep IOPlatformUUID | awk '{print $3}' | tr -d \"`
-- Windows: `reg query HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Cryptography /v MachineGuid` -- Windows: `reg query HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Cryptography /v MachineGuid`
hostname TEXT, -- platform.node(): 'lucky' hostname TEXT, -- platform.node(): 'lucky'
processor TEXT, -- platform.processor():
system TEXT, -- platform.system(): 'Linux' system TEXT, -- platform.system(): 'Linux'
release TEXT, -- platform.release(): '5.15.64'
cpu_type TEXT, -- platform.machine(): 'x86_64' cpu_type TEXT, -- platform.machine(): 'x86_64'
timezone TEXT, -- timezone, for interpreting event times processor TEXT -- platform.processor():
freedesktop_os_release TEXT, -- requires python 3.10
-- platform.freedesktop_os_release() as JSON
-- "{'NAME': 'NixOS', 'ID': 'nixos',
-- 'PRETTY_NAME': 'NixOS 22.05 (Quokka)', 'BUG_REPORT_URL':
-- 'https://github.com/NixOS/nixpkgs/issues', 'BUILD_ID':
-- '22.05.20220902.67e4507', 'DOCUMENTATION_URL':
-- 'https://nixos.org/learn.html', 'HOME_URL': 'https://nixos.org/', 'LOGO':
-- 'nix-snowflake', 'SUPPORT_URL': 'https://nixos.org/community.html',
-- 'VERSION': '22.05 (Quokka)', 'VERSION_CODENAME': 'quokka', 'VERSION_ID':
-- '22.05'}"
win32_ver TEXT, -- platform.win32_ver() as JSON
mac_ver TEXT, -- platform.mac_ver() as JSON
-- disallow duplicate rows
UNIQUE(hostname, processor, system, release, cpu_type, timezone,
freedesktop_os_release, win32_ver, mac_ver)
); );
-- Programs are run by users on machines -- Programs are run by users on machines
CREATE TABLE user(id INTEGER PRIMARY KEY NOT NULL, CREATE TABLE user(
sha256 TEXT PRIMARY KEY NOT NULL,
username TEXT NOT NULL, -- getpass.getuser() username TEXT NOT NULL, -- getpass.getuser()
userid INTEGER, -- os.getuid() userid INTEGER, -- os.getuid()
fullname TEXT, -- on Linux/OSX: pwd.getpwuid(os.getuid()).pw_gecos fullname TEXT, -- on Linux/OSX: pwd.getpwuid(os.getuid()).pw_gecos
-- on Windows: see https://stackoverflow.com/questions/21766954/how-to-get-windows-users-full-name-in-python -- on Windows: see https://stackoverflow.com/questions/21766954/how-to-get-windows-users-full-name-in-python
machine INTEGER NOT NULL, machine TEXT NOT NULL REFERENCES machine ON UPDATE CASCADE
-- usernames and fullnames change rarely, but it happens. A more restrictive
-- constraint would not enable tracking of this info. The following only
-- guarantees no duplicate full rows.
UNIQUE(username, userid, fullname, machine),
FOREIGN KEY (machine) REFERENCES machine (id) ON UPDATE CASCADE
); );
@ -88,41 +77,32 @@ CREATE TABLE user(id INTEGER PRIMARY KEY NOT NULL,
-- Client code should generate random UUIDs in the RFC 4122 variant layout. -- Client code should generate random UUIDs in the RFC 4122 variant layout.
-- https://datatracker.ietf.org/doc/html/rfc4122.html -- https://datatracker.ietf.org/doc/html/rfc4122.html
-- This is possible in Python by simply calling uuid.uuid4() with no arguments -- This is possible in Python by simply calling uuid.uuid4() with no arguments
CREATE TABLE store (id INTEGER PRIMARY KEY NOT NULL, CREATE TABLE store (
machine INTEGER, uuid TEXT PRIMARY KEY NOT NULL -- UUID generated by str(uuid.uuid4())
dbpath TEXT NOT NULL,
uuid TEXT NOT NULL, -- UUID generated by str(uuid.uuid4())
imported BOOL, -- is this the store for the current directory? If not, then it was
-- imported in order to describe a dependency.
FOREIGN KEY (machine) REFERENCES machine (id) ON UPDATE CASCADE
); );
-- The filedir table holds all files and directories that are tracked by the -- The filedir table holds all files and directories that are tracked by the
-- store. This table also holds tracked files and directories that have been -- store. This table also holds tracked files and directories that have been
-- imported and live outside the current store. -- imported and live outside the current store.
-- We do not support renaming files. Once an entry is created here, it should -- We do not support renaming files.
-- only be updated to reflect frozen/thawed status. CREATE TABLE filedir (
CREATE TABLE filedir (id INTEGER PRIMARY KEY NOT NULL, uuid TEXT PRIMARY KEY NOT NULL,
store INTEGER NOT NULL, store TEXT NOT NULL,
name TEXT, -- only a filename, not a path name TEXT, -- only a filename, not a path
parent INTEGER REFERENCES filedir ON UPDATE CASCADE, parent TEXT REFERENCES filedir ON UPDATE CASCADE,
frozen BOOL NOT NULL,
UNIQUE(store, name, parent) UNIQUE(store, name, parent)
); );
-- Detect cross-store references -- Detect cross-store references
CREATE TRIGGER insert_filedir BEFORE INSERT ON filedir CREATE TRIGGER insert_filedir BEFORE INSERT ON filedir
BEGIN SELECT CASE BEGIN SELECT CASE
WHEN NEW.parent IS NOT NULL AND NEW.store != (SELECT store FROM filedir WHERE id = NEW.parent) WHEN NEW.parent IS NOT NULL AND NEW.store != (SELECT store FROM filedir WHERE uuid = NEW.parent)
THEN RAISE (ABORT, 'Parent resides in different store') THEN RAISE (ABORT, 'Parent resides in different store')
END; END; END; END;
CREATE TRIGGER update_filedir BEFORE UPDATE ON filedir CREATE TRIGGER update_filedir BEFORE UPDATE ON filedir
BEGIN SELECT CASE BEGIN
WHEN (NEW.id != OLD.id OR NEW.store != OLD.store OR NEW.parent != OLD.parent) SELECT RAISE (ABORT, 'No updates to filedir allowed');
THEN RAISE (ABORT, 'The only updates to filedir allowed are to the frozen column') END;
END; END;
-- This table holds _versions_ of files and directories. This table holds -- This table holds _versions_ of files and directories. This table holds
@ -131,8 +111,9 @@ END; END;
-- independent of filetype (though the computation of content hashes is of -- independent of filetype (though the computation of content hashes is of
-- course dependent on filetype). Each version has a number, and was provided by -- course dependent on filetype). Each version has a number, and was provided by
-- some program (and potentially a datum). -- some program (and potentially a datum).
CREATE TABLE filedir_version (id INTEGER PRIMARY KEY NOT NULL, CREATE TABLE filedir_version (
filedir INTEGER NOT NULL uuid TEXT PRIMARY KEY NOT NULL,
filedir TEXT NOT NULL
REFERENCES filedir ON UPDATE CASCADE, -- parent filedir entry REFERENCES filedir ON UPDATE CASCADE, -- parent filedir entry
recorded_time REAL, -- When was this version recorded? recorded_time REAL, -- When was this version recorded?
@ -144,7 +125,7 @@ CREATE TABLE filedir_version (id INTEGER PRIMARY KEY NOT NULL,
-- We record the permissions on each file, in a way that enables reloading -- We record the permissions on each file, in a way that enables reloading
-- permissions properly when thawing after a freeze operation. -- permissions properly when thawing after a freeze operation.
unfrozen_perms TEXT, -- stat.filemode(os.stat(path).st_mode): '-rw-rw-r--' perms TEXT, -- stat.filemode(os.stat(path).st_mode): '-rw-rw-r--'
symlink_target TEXT, -- if this is a symlink, this is the (read but not fully resolved) target. i.e. this is the "content" of the symlink. symlink_target TEXT, -- if this is a symlink, this is the (read but not fully resolved) target. i.e. this is the "content" of the symlink.
@ -158,7 +139,7 @@ CREATE TABLE filedir_version (id INTEGER PRIMARY KEY NOT NULL,
-- can defer by only hashing files and directories at the end of a program. -- can defer by only hashing files and directories at the end of a program.
sha256 TEXT, sha256 TEXT,
source_task INTEGER REFERENCES task (id) ON UPDATE CASCADE source_task TEXT REFERENCES task ON UPDATE CASCADE
); );
-- Disallow UPDATING filedir_version. Instead, new version should be created. -- Disallow UPDATING filedir_version. Instead, new version should be created.
-- One exception is during importing, in which case we can disable the trigger -- One exception is during importing, in which case we can disable the trigger
@ -166,8 +147,7 @@ INSERT INTO triggers VALUES('update_filedir_version', TRUE);
CREATE TRIGGER update_filedir_version BEFORE UPDATE ON filedir_version CREATE TRIGGER update_filedir_version BEFORE UPDATE ON filedir_version
BEGIN SELECT CASE BEGIN SELECT CASE
WHEN (SELECT enabled FROM triggers WHERE name = 'update_filedir_version') WHEN (SELECT enabled FROM triggers WHERE name = 'update_filedir_version')
THEN RAISE (ABORT, THEN RAISE (ABORT, 'Updating filedir versions is prohibited')
'Updating filedir versions is prohibited, other than for id changes')
END; END; END; END;
-- TODO: check for inserting inconsistent version info re. deleted flag/filetype -- TODO: check for inserting inconsistent version info re. deleted flag/filetype
@ -176,17 +156,28 @@ END; END;
-- A computational environment which can execute "programs". Note that the -- A computational environment which can execute "programs". Note that the
-- python executable being used, and environment variables are recorded here. -- python executable being used, and environment variables are recorded here.
-- Other info is available in the parent "machine" table. -- Other info is available in the parent "machine" table.
CREATE TABLE environment (id INTEGER PRIMARY KEY NOT NULL, CREATE TABLE environment (
sha256 TEXT PRIMARY KEY NOT NULL,
envvars_json TEXT, -- json.dumps(dict(os.environ)) envvars_json TEXT, -- json.dumps(dict(os.environ))
python_implementation TEXT, -- platform.python_implementation(): 'cpython' python_implementation TEXT, -- platform.python_implementation(): 'cpython'
python_strversion TEXT, -- sys.version: '3.9.7 (default, Sep 16 2021, 13:09:58) \n[GCC 7.5.0]' python_strversion TEXT, -- sys.version: '3.9.7 (default, Sep 16 2021, 13:09:58) \n[GCC 7.5.0]'
python_hexversion INTEGER, -- sys.hexversion: 50923504 python_hexversion INTEGER, -- sys.hexversion: 50923504
user INTEGER, user TEXT REFERENCES user ON UPDATE CASCADE,
timezone TEXT, -- timezone, for interpreting event times
-- disallow duplicate rows platform_release TEXT, -- platform.release(): '5.15.64'
UNIQUE(envvars_json, python_implementation, python_strversion, python_hexversion, user), freedesktop_os_release TEXT, -- requires python 3.10
-- platform.freedesktop_os_release() as JSON
FOREIGN KEY (user) REFERENCES user (id) ON UPDATE CASCADE -- "{'NAME': 'NixOS', 'ID': 'nixos',
-- 'PRETTY_NAME': 'NixOS 22.05 (Quokka)', 'BUG_REPORT_URL':
-- 'https://github.com/NixOS/nixpkgs/issues', 'BUILD_ID':
-- '22.05.20220902.67e4507', 'DOCUMENTATION_URL':
-- 'https://nixos.org/learn.html', 'HOME_URL': 'https://nixos.org/', 'LOGO':
-- 'nix-snowflake', 'SUPPORT_URL': 'https://nixos.org/community.html',
-- 'VERSION': '22.05 (Quokka)', 'VERSION_CODENAME': 'quokka', 'VERSION_ID':
-- '22.05'}"
win32_ver TEXT, -- platform.win32_ver() as JSON
mac_ver TEXT -- platform.mac_ver() as JSON
); );
@ -195,7 +186,8 @@ CREATE TABLE environment (id INTEGER PRIMARY KEY NOT NULL,
-- python script, multiple "programs" may be created. Also note that historical -- python script, multiple "programs" may be created. Also note that historical
-- programs are automatically imported and merged when possible when loading a -- programs are automatically imported and merged when possible when loading a
-- "datum" from disk. -- "datum" from disk.
CREATE TABLE program (id INTEGER PRIMARY KEY NOT NULL, CREATE TABLE program (
uuid TEXT PRIMARY KEY NOT NULL,
name TEXT, -- name of the program, usually written lowercase by calling code e.g. cnn_crossval name TEXT, -- name of the program, usually written lowercase by calling code e.g. cnn_crossval
-- Names of built-in operations will be shown in upper case: e.g. 'FREEZE' -- Names of built-in operations will be shown in upper case: e.g. 'FREEZE'
@ -205,65 +197,57 @@ CREATE TABLE program (id INTEGER PRIMARY KEY NOT NULL,
end_time REAL, end_time REAL,
process_id INTEGER, -- host PID of python process on host OS process_id INTEGER, -- host PID of python process on host OS
environment INTEGER NOT NULL, environment TEXT NOT NULL REFERENCES environment ON UPDATE CASCADE,
message TEXT, -- user-defined message to help distinguish similar runs message TEXT NOT NULL -- user-defined message to help distinguish similar runs
FOREIGN KEY (environment) REFERENCES environment (id) ON UPDATE CASCADE
); );
-- We try to track all python packages that impact execution by traversing a -- We try to track all python packages that impact execution by traversing a
-- copy of sys.modules. This is done once before a "program" and once after in -- copy of sys.modules. This is done once before a "program" and once after in
-- case some calling code winds up calling a previously-unloaded module. -- case some calling code winds up calling a previously-unloaded module.
CREATE TABLE py_package (id INTEGER PRIMARY KEY NOT NULL, CREATE TABLE py_package (
sha256 TEXT PRIMARY KEY NOT NULL,
name TEXT NOT NULL, name TEXT NOT NULL,
version TEXT version TEXT,
UNIQUE (name, version)
); );
-- A py_module describes any python module file containing decorated Functions. -- A py_module describes any python module file containing decorated Functions.
-- Modules are tracked since they impact the global scope of function calls. -- Modules are tracked since they impact the global scope of function calls.
CREATE TABLE py_module(id INTEGER PRIMARY KEY NOT NULL, CREATE TABLE py_module(
sha256 TEXT PRIMARY KEY NOT NULL,
name TEXT NOT NULL, name TEXT NOT NULL,
code TEXT, code TEXT, -- code doesn't have to be included, but should be used to create sha256
py_package INTEGER, py_package TEXT REFERENCES py_package ON UPDATE CASCADE
FOREIGN KEY (py_package) REFERENCES py_package (id) ON UPDATE CASCADE
); );
-- A py_function just describes a function, without reference to its arguments. -- A py_function just describes a function, without reference to its arguments.
-- It can have inputs and outputs, which are described in the func_inputs and -- It can have inputs and outputs, which are described in the func_inputs and
-- func_outputs children tables. -- func_outputs children tables.
CREATE TABLE py_function(id INTEGER PRIMARY KEY NOT NULL, CREATE TABLE py_function(
sha256 TEXT PRIMARY KEY NOT NULL,
name TEXT NOT NULL, name TEXT NOT NULL,
py_module INTEGER NOT NULL, py_module TEXT NOT NULL REFERENCES py_module ON UPDATE CASCADE
-- we use POSIX timestamps for time recording.
-- e.g. datetime.datetime.now().timestamp()
start_time REAL,
end_time REAL,
FOREIGN KEY (py_module) REFERENCES py_module (id) ON UPDATE CASCADE
); );
CREATE TABLE py_function_input(id INTEGER PRIMARY KEY NOT NULL, CREATE TABLE py_function_input(
uuid TEXT PRIMARY KEY NOT NULL,
name TEXT NOT NULL, name TEXT NOT NULL,
typename TEXT NOT NULL, typename TEXT NOT NULL,
py_function INTEGER NOT NULL, py_function TEXT NOT NULL REFERENCES py_function ON UPDATE CASCADE,
position INTEGER, position INTEGER,
posonly BOOL, posonly BOOL,
kwonly BOOL, kwonly BOOL,
description TEXT, description TEXT
FOREIGN KEY (py_function) REFERENCES py_function (id) ON UPDATE CASCADE
); );
CREATE TABLE py_function_output(id INTEGER PRIMARY KEY NOT NULL, CREATE TABLE py_function_output(
uuid TEXT PRIMARY KEY NOT NULL,
name TEXT, name TEXT,
typename TEXT NOT NULL, typename TEXT NOT NULL,
py_function INTEGER NOT NULL, py_function TEXT NOT NULL REFERENCES py_function ON UPDATE CASCADE,
position INTEGER NOT NULL, position INTEGER NOT NULL,
description TEXT, description TEXT
FOREIGN KEY (py_function) REFERENCES py_function (id) ON UPDATE CASCADE
); );
@ -271,47 +255,42 @@ CREATE TABLE py_function_output(id INTEGER PRIMARY KEY NOT NULL,
-- particular set of inputs which are themselves data (see datum table). A task -- particular set of inputs which are themselves data (see datum table). A task
-- is executed in the context of a "program". Within a program, tasks are -- is executed in the context of a "program". Within a program, tasks are
-- typically evaluated in a serial manner. -- typically evaluated in a serial manner.
CREATE TABLE task(id INTEGER PRIMARY KEY NOT NULL, CREATE TABLE task(
program INTEGER NOT NULL, uuid TEXT PRIMARY KEY NOT NULL,
program TEXT NOT NULL REFERENCES program ON UPDATE CASCADE,
py_function INTEGER, -- func is NULL for some built-in functionality like "record" programs -- py_function is NULL for some built-in functionality like "RECORD" programs
py_function TEXT REFERENCES py_function ON UPDATE CASCADE
FOREIGN KEY (py_function) REFERENCES py_function (id) ON UPDATE CASCADE,
FOREIGN KEY (program) REFERENCES program (id) ON UPDATE CASCADE
); );
-- A datum is an object that is computed as the output of a task, given as a -- A datum is an object that is computed as the output of a task, given as a
-- literal value in a config file, or loaded from a file. -- literal value in a config file, or loaded from a file.
CREATE TABLE datum(id INTEGER PRIMARY KEY NOT NULL, CREATE TABLE datum(
-- provider_type describes where the datum came from. Choices are: uuid TEXT PRIMARY KEY NOT NULL,
-- - COMPUTED: output of a decorated Function -- provider_type describes where the datum came from. Choices are:
-- - IMPORTED: imported output from a prior program -- - COMPUTED: output of a decorated Function
-- - LITERAL: literal value provided in a config file -- - IMPORTED: imported output from a prior program
-- Note that literals are implicitly imported (hash computed) at runtime -- - LITERAL: literal value provided in a config file
-- Note that literals are implicitly imported (hash computed) at runtime
provider_type TEXT, provider_type TEXT,
-- task and task_output are only used when provider_type='COMPUTED' -- task and task_output are only used when provider_type='COMPUTED'
task INTEGER, task TEXT REFERENCES task ON UPDATE CASCADE,
task_output INTEGER, task_output TEXT REFERENCES func_output ON UPDATE CASCADE,
literal_json TEXT, -- if provider_type='LITERAL', a JSON representation of the value literal_json TEXT, -- if provider_type='LITERAL', a JSON representation of the value
typename TEXT NOT NULL, -- string representation of the data type typename TEXT NOT NULL -- string representation of the data type
FOREIGN KEY (task) REFERENCES task (id) ON UPDATE CASCADE,
FOREIGN KEY (task_output) REFERENCES func_output (id) ON UPDATE CASCADE
); );
-- A task_input records the version of a Datum that is passed to a function -- A task_input records the version of a Datum that is passed to a function
CREATE TABLE task_input(id INTEGER PRIMARY KEY NOT NULL, CREATE TABLE task_input(
task INTEGER NOT NULL, uuid TEXT PRIMARY KEY NOT NULL,
py_function_input INTEGER, -- if this was a python function, reference which input task TEXT NOT NULL REFERENCES task ON UPDATE CASCADE,
-- if this was a python function, reference which input
py_function_input TEXT REFERENCES py_function_input ON UPDATE CASCADE,
datum INTEGER NOT NULL, datum TEXT NOT NULL REFERENCES datum ON UPDATE CASCADE,
-- Data have versions to facilitate tracking non-const operations. If a datum -- Data have versions to facilitate tracking non-const operations. If a datum
-- is passed to a non-const operation, it must increment its internal -- is passed to a non-const operation, it must increment its internal
-- version -- version
datum_version INTEGER NOT NULL, datum_version INTEGER NOT NULL
FOREIGN KEY (task) REFERENCES task (id) ON UPDATE CASCADE,
FOREIGN KEY (py_function_input) REFERENCES py_function_input (id) ON UPDATE CASCADE,
FOREIGN KEY (datum) REFERENCES datum (id) ON UPDATE CASCADE
); );

View File

@ -4,16 +4,18 @@ from loguru import logger
from . import db, environment, fs from . import db, environment, fs
from contextlib import contextmanager
from dataclasses import dataclass from dataclasses import dataclass
import datetime import datetime
import os import os
from pathlib import Path from pathlib import Path
import sqlite3 import sqlite3
from typing import Any, Optional, TypeVar, Type, Union from typing import Any, Iterator, Optional, TypeVar, Type, Union
import uuid
import warnings import warnings
@dataclass @dataclass(slots=True)
class Program: class Program:
store: "Store" store: "Store"
name: str name: str
@ -23,40 +25,45 @@ class Program:
start_time: Optional[datetime.datetime] = None start_time: Optional[datetime.datetime] = None
evaluated: bool = False evaluated: bool = False
uuid: str = ""
def __post_init__(self) -> None:
self.uuid = str(uuid.uuid4())
def __enter__(self) -> "Program": def __enter__(self) -> "Program":
if self.evaluated: if self.evaluated:
raise RuntimeError("Cannot re-enter a Program context") raise RuntimeError("Cannot re-enter a Program context")
assert self.store.conn is not None env = environment.Environment.detect()
cur = self.store.conn.cursor() with self.store.committing() as cur:
env = environment.Environment.find_or_insert(cur) env.maybe_insert(cur)
cur.execute(
cur.execute( "INSERT INTO program VALUES (?, ?, ?, ?, ?, ?, ?)",
"INSERT INTO program VALUES (?, ?, ?, ?, ?, ?, ?)", (
( self.uuid, # uuid
None, # id INTEGER PRIMARY KEY NOT NULL, self.name, # name TEXT,
self.name, # name TEXT, # name of the program, usually written lowercase by calling
# name of the program, usually written lowercase by calling # code e.g. cnn_crossval
# code e.g. cnn_crossval # -- we use POSIX timestamps for time recording.
# -- we use POSIX timestamps for time recording. # -- e.g. datetime.datetime.now().timestamp()
# -- e.g. datetime.datetime.now().timestamp() None, # start_time REAL,
None, # start_time REAL, None, # end_time REAL,
None, # end_time REAL, os.getpid(), # process_id INTEGER, -- host PID of python process on host OS
os.getpid(), # process_id INTEGER, -- host PID of python process on host OS env.sha256.hex(), # environment INTEGER NOT NULL,
env.id, # environment INTEGER NOT NULL, self.message, # user-defined message to help distinguish similar runs
self.message, # user-defined message to help distinguish similar runs ),
), )
) self.id = cur.lastrowid
self.id = cur.lastrowid
self.start_time = datetime.datetime.now() self.start_time = datetime.datetime.now()
return self return self
def new_task(self, name: str, py_function_id: Optional[int] = None) -> int: def new_task(
self,
name: str,
cur: sqlite3.Cursor,
py_function_id: Optional[int] = None,
) -> int:
"""Create a new task and return its id""" """Create a new task and return its id"""
assert self.store.conn is not None
cur = self.store.conn.cursor()
cur.execute( cur.execute(
"INSERT INTO task VALUES (?, ?, ?)", "INSERT INTO task VALUES (?, ?, ?)",
(None, self.id, py_function_id), (None, self.id, py_function_id),
@ -75,21 +82,20 @@ class Program:
# record start and end times in store # record start and end times in store
assert self.store.conn is not None assert self.store.conn is not None
cur = self.store.conn.cursor() with self.store.committing() as cur:
cur.execute( cur.execute(
""" """
UPDATE UPDATE
program program
SET SET
start_time = ?, start_time = ?,
end_time = ? end_time = ?
WHERE WHERE
id = ? uuid = ?
""", """,
(self.start_time, end_time, self.id), (self.start_time, end_time, self.uuid),
) )
cur.connection.commit() self.evaluated = True # prevent re-running
self._evaluated = True # prevent re-running
assert self.start_time is not None assert self.start_time is not None
elapsed = end_time - self.start_time elapsed = end_time - self.start_time
logger.success( logger.success(
@ -126,11 +132,14 @@ class Store:
else: else:
self.path = Path(directory) self.path = Path(directory)
self.db_path = self.path / "nancy.db" self.db_path = self.path / "nancy.db"
if conn is None: if conn is None:
self.connect() self.connect()
else: else:
self.conn = conn self.conn = conn
self.store_uuid = self.find_store_uuid()
def copy(self: _StoreT, store_path: fs.PathStr) -> _StoreT: def copy(self: _StoreT, store_path: fs.PathStr) -> _StoreT:
"""Copy this store to a new store path""" """Copy this store to a new store path"""
assert self.conn is not None assert self.conn is not None
@ -141,10 +150,28 @@ class Store:
return self.__class__(store_path) return self.__class__(store_path)
def connect(self) -> sqlite3.Connection: def connect(self) -> sqlite3.Connection:
self.conn = sqlite3.connect(self.db_path) self.conn = sqlite3.connect(self.db_path, isolation_level="DEFERRED")
self.conn.cursor().execute("PRAGMA foreign_keys = ON;") self.conn.cursor().execute("PRAGMA foreign_keys = ON;")
return self.conn return self.conn
@contextmanager
def committing(self) -> Iterator[sqlite3.Cursor]:
    """Yield a cursor whose work is committed on success, rolled back on error.

    A fresh cursor is created on ``self.conn`` and handed to the ``with``
    body. If the body finishes normally the connection is committed. If the
    body raises, the pending transaction is rolled back and the exception is
    re-raised unchanged, so a failed operation cannot leave half-written rows
    that a later, unrelated commit on the same connection would persist.

    Yields:
        A cursor on ``self.conn``.

    Raises:
        AssertionError: if the store has no open connection.
    """
    assert self.conn is not None
    cur = self.conn.cursor()
    try:
        yield cur
    except BaseException:
        # Roll back for *any* failure (including KeyboardInterrupt) and let
        # the original exception propagate to the caller.
        self.conn.rollback()
        raise
    else:
        self.conn.commit()
def find_store_uuid(self, cur: Optional[sqlite3.Cursor] = None) -> Optional[str]:
    """Look up the UUID of the local store in the ``local_metadata`` table.

    Args:
        cur: Cursor to run the query on. When omitted, a new cursor is
            created on ``self.conn``.

    Returns:
        The ``value`` stored under the ``store_uuid`` key, or ``None`` when
        no such row exists.

    Raises:
        AssertionError: if the store has no open connection.
    """
    assert self.conn is not None
    if cur is None:
        cur = self.conn.cursor()
    # Bind the key as a parameter: a double-quoted "store_uuid" is an
    # identifier in SQL and is only treated as a string by a SQLite
    # compatibility fallback that can be disabled.
    cur.execute(
        "SELECT value FROM local_metadata WHERE key = ? LIMIT 1",
        ("store_uuid",),
    )
    row = cur.fetchone()
    return None if row is None else row[0]
@classmethod @classmethod
def init( def init(
cls: Type[_StoreT], message: str, directory: Optional[fs.PathStr] = None cls: Type[_StoreT], message: str, directory: Optional[fs.PathStr] = None
@ -163,14 +190,27 @@ class Store:
f"File {db_path} exists. Refusing to re-initialize", f"File {db_path} exists. Refusing to re-initialize",
) )
# initialize a database in the target directory # initialize a database in the target directory
conn = sqlite3.connect(db_path) conn = sqlite3.connect(db_path, isolation_level="DEFERRED")
db.init_schema(conn.cursor()) cur = conn.cursor()
db.init_schema(cur)
conn.commit()
cur.close()
new_store = cls(directory, conn) new_store = cls(directory, conn)
with new_store.program("INIT", message) as p: with new_store.program("INIT", message) as p:
# set the timing to the actual times it took to initialize the db # set the timing to the actual times it took to initialize the db
p.start_time = start_time p.start_time = start_time
# generate a new UUID for this store
assert new_store.store_uuid is None
new_store.store_uuid = str(uuid.uuid4())
with new_store.committing() as cur:
cur.execute(
'INSERT INTO local_metadata VALUES ("store_uuid", ?)',
(new_store.store_uuid,),
)
return new_store return new_store
def make_readonly(self) -> None: def make_readonly(self) -> None:
@ -345,24 +385,13 @@ class Store:
cur = self.conn.cursor() cur = self.conn.cursor()
with self.program("RECORD", message) as p: with self.program("RECORD", message) as p:
# create a task for this operation with self.committing() as cur: # entire record operation is one transaction
task_id = p.new_task("Store._record_recursive") # create a task for this operation
task_id = p.new_task("Store._record_recursive", cur=cur)
# descend the diff, tracking parent filedir IDs, creating them and # descend the diff, tracking parent filedir IDs, creating them and
# recording new versions of each, when necessary # recording new versions of each, when necessary
self._record_recursive(diff, cur, source_task=task_id) self._record_recursive(diff, cur, source_task=task_id)
class StoreFile:
"""Describes a file that is recorded in the store."""
def __init__(self, store: Store, rel_path: fs.PathStr):
self.store = store
self.rel_path = rel_path
def save(self) -> None:
# call the appropriate save method
pass
def find_store(path: Union[str, "os.PathLike[str]"]) -> Optional[str]: def find_store(path: Union[str, "os.PathLike[str]"]) -> Optional[str]:

View File

@ -1,76 +1,59 @@
from . import machine from . import machine
from dataclasses import dataclass
import hashlib
import getpass import getpass
import os import os
import pwd import pwd
import sqlite3 import sqlite3
from typing import NamedTuple, Optional, Type, TypeVar from typing import Callable, NamedTuple, Optional, Type, TypeVar
# see https://stackoverflow.com/questions/44640479/type-annotation-for-classmethod-returning-instance # see https://stackoverflow.com/questions/44640479/type-annotation-for-classmethod-returning-instance
_UserT = TypeVar("_UserT", bound="User") _UserT = TypeVar("_UserT", bound="User")
class User(NamedTuple): @dataclass(frozen=True, slots=True)
id: Optional[int] # if not None, this is `id` in the `machine` table class User:
username: str username: str
userid: int userid: int
fullname: str fullname: str
machine: machine.Machine machine: machine.Machine
sha256: bytes = b""
@classmethod def __post_init__(self) -> None:
def find_or_insert( m = hashlib.sha256()
cls: Type[_UserT], cur: sqlite3.Cursor, user: Optional[_UserT] = None upstr: Callable[[str], None] = lambda s: m.update(bytes(s, "utf-8"))
) -> _UserT: upint: Callable[[int], None] = lambda n: m.update(
"""Given a DB cursor, find or create row in user table and fill""" n.to_bytes((n.bit_length() + 7) // 8, byteorder="big"),
if user is None: )
user = cls.detect() upstr(self.username)
upint(self.userid)
upstr(self.fullname)
m.update(self.machine.sha256)
# circumvent freezing to set sha256
object.__setattr__(self, "sha256", m.digest())
m = machine.Machine.find_or_insert(cur) def maybe_insert(self, cur: sqlite3.Cursor) -> str:
"""Insert if not exists, without committing."""
self.machine.maybe_insert(cur)
user = user._replace(machine=m) key = self.sha256.hex()
# insert or ignore, handle each case to set id
cur.execute( cur.execute(
""" """
SELECT INSERT OR IGNORE INTO user VALUES (?,?,?,?,?);
id
FROM
user
WHERE
username = ? AND
userid = ? AND
fullname = ? AND
machine = ?
LIMIT 1
""", """,
( (
user.username, key,
user.userid, self.username,
user.fullname, self.userid,
user.machine.id, self.fullname,
self.machine.sha256.hex(),
), ),
) )
res = cur.fetchone()
if res is None:
cur.execute(
"""
INSERT INTO user VALUES (?,?,?,?,?);
""",
(
user.id,
user.username,
user.userid,
user.fullname,
user.machine.id,
),
)
id = cur.lastrowid
cur.connection.commit()
else:
id = res[0]
return user._replace(id=id) return key
@classmethod @classmethod
def detect(cls: Type[_UserT]) -> _UserT: def detect(cls: Type[_UserT]) -> _UserT:
@ -81,12 +64,9 @@ class User(NamedTuple):
# TODO: will this fail on Windows/OSX? # TODO: will this fail on Windows/OSX?
fullname = pwd.getpwuid(os.getuid()).pw_gecos fullname = pwd.getpwuid(os.getuid()).pw_gecos
m = machine.Machine.detect()
return cls( return cls(
None,
getpass.getuser(), getpass.getuser(),
os.getuid(), os.getuid(),
fullname, fullname,
m, machine=machine.Machine.detect(),
) )

View File

@ -1,3 +1,3 @@
from .db import schema_version # NOQA from .db import schema_version as schema_version # NOQA
__version__ = "0.1.0" __version__ = "0.1.0"

View File

@ -6,10 +6,11 @@ import pytest
import datetime import datetime
import os import os
import sqlite3 import sqlite3
from typing import Iterator
@pytest.fixture @pytest.fixture
def temp_db(): def temp_db() -> Iterator[sqlite3.Cursor]:
"""Create an in-memory database that follow's the nancy schema""" """Create an in-memory database that follow's the nancy schema"""
with sqlite3.connect(":memory:") as conn: with sqlite3.connect(":memory:") as conn:
cur = conn.cursor() cur = conn.cursor()
@ -18,47 +19,42 @@ def temp_db():
db.init_schema(cur) db.init_schema(cur)
cur.execute(
'INSERT INTO local_metadata VALUES ("store_uuid", ?)',
("78dc0b93-0e22-45ee-ae90-49d3575dd70f",),
)
yield cur yield cur
@pytest.fixture @pytest.fixture
def insert_machine(temp_db): def insert_machine(temp_db: sqlite3.Cursor) -> sqlite3.Cursor:
cur = temp_db cur = temp_db
cur.executemany( cur.executemany(
"INSERT INTO machine VALUES " "(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", "INSERT INTO machine VALUES " "(?, ?, ?, ?, ?, ?)",
[ [
( (
None, # id INTEGER PRIMARY KEY NOT NULL, "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", # sha256
"a5d97c08a15c4db69f5fded523a1bfe3", # machine_id TEXT, -- platform-dependent unique hardware id "a5d97c08a15c4db69f5fded523a1bfe3", # machine_id TEXT
"lucky", # hostname TEXT, -- platform.node(): 'lucky' "lucky", # hostname TEXT, -- platform.node(): 'lucky'
"", # processor TEXT, -- platform.processor():
"Linux", # system TEXT, -- platform.system(): 'Linux' "Linux", # system TEXT, -- platform.system(): 'Linux'
"5.15.64", # release TEXT, -- platform.release(): '5.15.64' "x86_64", # cpu_type TEXT, -- platform.machine():
"aarch64", # machine TEXT, -- platform.machine(): 'x86_64' "i386", # processor TEXT, -- platform.processor():
"EDT", # timezone TEXT, -- timezone, for interpreting event times
"", # freedesktop_os_release TEXT, -- requires python 3.10
"", # win32_ver TEXT, -- platform.win32_ver() as JSON
"", # mac_ver TEXT -- platform.mac_ver() as JSON
), ),
( (
None, # id INTEGER PRIMARY KEY NOT NULL, "2c26b46b68ffc68ff99b453c1d30413413422d706483bfa0f98a5e886266e7ae", # sha256
"afc9b06a23b74341b29d42b8312a4f8a", "b5d97c08a15c4db69f5fded523a1bfe3", # machine_id TEXT
"a100", # hostname TEXT, -- platform.node(): 'lucky' "a100", # hostname TEXT, -- platform.node(): 'lucky'
"", # processor TEXT, -- platform.processor():
"Linux", # system TEXT, -- platform.system(): 'Linux' "Linux", # system TEXT, -- platform.system(): 'Linux'
"5.15.63", # release TEXT, -- platform.release(): '5.15.64' "x86_64", # cpu_type TEXT, -- platform.machine():
"x86_64", # machine TEXT, -- platform.machine(): 'x86_64' "i386", # processor TEXT, -- platform.processor():
"EST", # timezone TEXT, -- timezone, for interpreting event times
"", # freedesktop_os_release TEXT, -- requires python 3.10
"", # win32_ver TEXT, -- platform.win32_ver() as JSON
"", # mac_ver TEXT -- platform.mac_ver() as JSON
), ),
], ],
) )
return cur return cur
def test_insert_machine(insert_machine): def test_insert_machine(insert_machine: sqlite3.Cursor) -> None:
cur = insert_machine cur = insert_machine
cur.execute("SELECT * FROM machine") cur.execute("SELECT * FROM machine")
machines = cur.fetchall() machines = cur.fetchall()
@ -66,45 +62,45 @@ def test_insert_machine(insert_machine):
@pytest.fixture @pytest.fixture
def insert_user(insert_machine): def insert_user(insert_machine: sqlite3.Cursor) -> sqlite3.Cursor:
cur = insert_machine cur = insert_machine
cur.executemany( cur.executemany(
"INSERT INTO user VALUES " "(?, ?, ?, ?, ?)", "INSERT INTO user VALUES (?, ?, ?, ?, ?)",
[ [
( ( # jacob@lucky
None, # id INTEGER PRIMARY KEY NOT NULL, "f2ee9db3526d36b4346980f12a49cb404b924cf784dbd3d5db8a362e363a8070", # sha256
"jacob", # username TEXT NOT NULL, "jacob", # username TEXT NOT NULL,
101, # userid INTEGER, 101, # userid INTEGER,
"Jacob Hinkle", # fullname TEXT, "Jacob Hinkle", # fullname TEXT,
1, # machine INTEGER NOT NULL, "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", # machine
), ),
( ( # jacob@a100
None, # id INTEGER PRIMARY KEY NOT NULL, "3219c598f4e6e84bea3d0c295f0d2c35f3ce01747f4839fb698b32646b037d3c", # sha256
"jacob", # username TEXT NOT NULL, "jacob", # username TEXT NOT NULL,
10301, # userid INTEGER, 10301, # userid INTEGER,
"Jacob Hinkle", # fullname TEXT, "Jacob Hinkle", # fullname TEXT,
2, # machine INTEGER NOT NULL, "2c26b46b68ffc68ff99b453c1d30413413422d706483bfa0f98a5e886266e7ae", # machine
), ),
( ( # bob@a100
None, # id INTEGER PRIMARY KEY NOT NULL, "99389acaef44ed99c16bcaf1bff0c261b35ad110f0544092f711a1354e616f61", # sha256
"bob", # username TEXT NOT NULL, "bob", # username TEXT NOT NULL,
2035, # userid INTEGER, 2035, # userid INTEGER,
"Just Bob", # fullname TEXT, "Just Bob", # fullname TEXT,
2, # machine INTEGER NOT NULL, "2c26b46b68ffc68ff99b453c1d30413413422d706483bfa0f98a5e886266e7ae", # machine
), ),
], ],
) )
return cur return cur
def test_insert_user(insert_user): def test_insert_user(insert_user: sqlite3.Cursor) -> None:
cur = insert_user cur = insert_user
cur.execute("SELECT * FROM user") cur.execute("SELECT * FROM user")
users = cur.fetchall() users = cur.fetchall()
assert len(users) == 3 assert len(users) == 3
def test_invalid_user_machine(insert_user): def test_invalid_user_machine(insert_user: sqlite3.Cursor) -> None:
cur = insert_user cur = insert_user
with pytest.raises(sqlite3.IntegrityError): with pytest.raises(sqlite3.IntegrityError):
# should fail foreign key constraint # should fail foreign key constraint
@ -133,66 +129,44 @@ def test_invalid_user_machine(insert_user):
@pytest.fixture @pytest.fixture
def insert_store(insert_machine): def insert_store(insert_machine: sqlite3.Cursor) -> sqlite3.Cursor:
import uuid import uuid
cur = insert_machine cur = insert_machine
cur.executemany( cur.executemany(
"INSERT INTO store VALUES " "(?, ?, ?, ?, ?)", "INSERT INTO store VALUES " "(?)",
[ [
( ("63c2a84a-524f-46ad-b512-b99e80f7385b",),
None, # id INTEGER PRIMARY KEY NOT NULL, ("3be35ff6-6a7b-49cf-ba3e-c610b781b54e",),
1, # machine INTEGER, ("b8f685d8-2322-403d-a59c-fb8301229788",),
"/path/to/first/store", # dbpath TEXT NOT NULL,
str(uuid.uuid4()), # -- UUID generated by str(uuid.uuid4())
False, # imported BOOL,
),
(
None, # id INTEGER PRIMARY KEY NOT NULL,
1, # machine INTEGER,
"/path/to/dependency/store", # dbpath TEXT NOT NULL,
str(uuid.uuid4()), # -- UUID generated by str(uuid.uuid4())
True, # imported BOOL,
),
(
None, # id INTEGER PRIMARY KEY NOT NULL,
2, # machine INTEGER,
# same path but on a separate machine
"/path/to/first/store", # dbpath TEXT NOT NULL,
str(uuid.uuid4()), # -- UUID generated by str(uuid.uuid4())
True, # imported BOOL,
),
], ],
) )
return cur return cur
@pytest.fixture @pytest.fixture
def insert_directories(insert_store): def insert_directories(insert_store: sqlite3.Cursor) -> sqlite3.Cursor:
cur = insert_store cur = insert_store
cur.executemany( cur.executemany(
"INSERT INTO filedir VALUES " "(?, ?, ?, ?, ?)", "INSERT INTO filedir VALUES (?, ?, ?, ?)",
[ [
( (
None, # id INTEGER PRIMARY KEY NOT NULL, "71e75275-0847-4e47-9df5-1f2bbe01da91", # uuid
1, # store INTEGER NOT NULL, "63c2a84a-524f-46ad-b512-b99e80f7385b", # store
".", # filename TEXT, -- only a filename, not a path ".", # filename
None, # parent INTEGER REFERENCES filedir ON UPDATE CASCADE, None, # parent
False, # frozen BOOL NOT NULL,
), ),
( (
None, # id INTEGER PRIMARY KEY NOT NULL, "6051a251-d38a-4d8c-ba57-ac74023ec2f4", # uuid
1, # store INTEGER NOT NULL, "63c2a84a-524f-46ad-b512-b99e80f7385b", # store
"foo", # filename TEXT, -- only a filename, not a path "foo", # filename
1, # parent INTEGER REFERENCES filedir ON UPDATE CASCADE, "71e75275-0847-4e47-9df5-1f2bbe01da91", # parent
False, # frozen BOOL NOT NULL,
), ),
( (
None, # id INTEGER PRIMARY KEY NOT NULL, "5f74b5f2-eac8-4a86-90f6-4f7c7981564d", # uuid
2, # store INTEGER NOT NULL, "3be35ff6-6a7b-49cf-ba3e-c610b781b54e", # store
".", # filename TEXT, -- only a filename, not a path ".", # filename
None, # parent INTEGER REFERENCES filedir ON UPDATE CASCADE, None, # parent
False, # frozen BOOL NOT NULL,
), ),
], ],
) )
@ -200,34 +174,34 @@ def insert_directories(insert_store):
"INSERT INTO filedir_version VALUES " "(?, ?, ?, ?, ?, ?, ?, ?, ?)", "INSERT INTO filedir_version VALUES " "(?, ?, ?, ?, ?, ?, ?, ?, ?)",
[ [
( (
None, # id INTEGER PRIMARY KEY NOT NULL, "baad81ef-7cc6-48df-973c-e9f32ceda19a", # uuid
1, # INTEGER REFERENCES filedir ON UPDATE CASCADE, -- parent filedir entry "71e75275-0847-4e47-9df5-1f2bbe01da91", # filedir
datetime.datetime.now().timestamp(), datetime.datetime.now().timestamp(),
"DIR", # filetype TEXT, -- One of 'LNK', 'DIR', 'REG', etc. See store.FSEntry.from_path for details "DIR", # filetype TEXT, -- One of 'LNK', 'DIR', 'REG', etc. See store.FSEntry.from_path for details
False, # deleted BOOL NOT NULL, -- set True when recording a deleted file False, # deleted BOOL NOT NULL, -- set True when recording a deleted file
"drwxrwxr-x", # unfrozen_perms TEXT, -- stat.filemode(os.stat(path).st_mode): '-rw-rw-r--' "drwxrwxr-x", # perms TEXT, -- stat.filemode(os.stat(path).st_mode): '-rw-rw-r--'
None, # symlink_target TEXT, -- if this is a symlink, this is the (read but not fully resolved) target. I.e. this is the "content" of the symlink. None, # symlink_target TEXT, -- if this is a symlink, this is the (read but not fully resolved) target. I.e. this is the "content" of the symlink.
"a84ed33864d06615a87bc8da5258d841163f1e7969367ecd07b041ae1a18febd", # sha256 TEXT, "a84ed33864d06615a87bc8da5258d841163f1e7969367ecd07b041ae1a18febd", # sha256 TEXT,
None, # source_task INTEGER, None, # source_task INTEGER,
), ),
( (
None, # id INTEGER PRIMARY KEY NOT NULL, "c4037a74-88cd-4a1d-b158-4ebebb25f10d", # uuid
1, # INTEGER REFERENCES filedir ON UPDATE CASCADE, -- parent filedir entry "71e75275-0847-4e47-9df5-1f2bbe01da91", # filedir
datetime.datetime.now().timestamp(), datetime.datetime.now().timestamp(),
"DIR", # filetype TEXT, -- One of 'LNK', 'DIR', 'REG', etc. See store.FSEntry.from_path for details "DIR", # filetype TEXT, -- One of 'LNK', 'DIR', 'REG', etc. See store.FSEntry.from_path for details
False, # deleted BOOL NOT NULL, -- set True when recording a deleted file False, # deleted BOOL NOT NULL, -- set True when recording a deleted file
"drwxrwxr-x", # unfrozen_perms TEXT, -- stat.filemode(os.stat(path).st_mode): '-rw-rw-r--' "drwxrwxr-x", # perms TEXT, -- stat.filemode(os.stat(path).st_mode): '-rw-rw-r--'
None, # symlink_target TEXT, -- if this is a symlink, this is the (read but not fully resolved) target. I.e. this is the "content" of the symlink. None, # symlink_target TEXT, -- if this is a symlink, this is the (read but not fully resolved) target. I.e. this is the "content" of the symlink.
"a84ed33864d06615a87bc8da5258d841163f1e7969367ecd07b041ae1a18febd", # sha256 TEXT, "a84ed33864d06615a87bc8da5258d841163f1e7969367ecd07b041ae1a18febd", # sha256 TEXT,
None, # source_task INTEGER, None, # source_task INTEGER,
), ),
( (
None, # id INTEGER PRIMARY KEY NOT NULL, "dcfbe086-3c4b-43b8-a550-974610b86267", # uuid
1, # INTEGER REFERENCES filedir ON UPDATE CASCADE, -- parent filedir entry "5f74b5f2-eac8-4a86-90f6-4f7c7981564d", # filedir
datetime.datetime.now().timestamp(), datetime.datetime.now().timestamp(),
"DIR", # filetype TEXT, -- One of 'LNK', 'DIR', 'REG', etc. See store.FSEntry.from_path for details "DIR", # filetype TEXT, -- One of 'LNK', 'DIR', 'REG', etc. See store.FSEntry.from_path for details
False, # deleted BOOL NOT NULL, -- set True when recording a deleted file False, # deleted BOOL NOT NULL, -- set True when recording a deleted file
"drwxrwxr-x", # unfrozen_perms TEXT, -- stat.filemode(os.stat(path).st_mode): '-rw-rw-r--' "drwxrwxr-x", # perms TEXT, -- stat.filemode(os.stat(path).st_mode): '-rw-rw-r--'
None, # symlink_target TEXT, -- if this is a symlink, this is the (read but not fully resolved) target. I.e. this is the "content" of the symlink. None, # symlink_target TEXT, -- if this is a symlink, this is the (read but not fully resolved) target. I.e. this is the "content" of the symlink.
"a84ed33864d06615a87bc8da5258d841163f1e7969367ecd07b041ae1a18febd", # sha256 TEXT, "a84ed33864d06615a87bc8da5258d841163f1e7969367ecd07b041ae1a18febd", # sha256 TEXT,
None, # source_task INTEGER, None, # source_task INTEGER,
@ -237,18 +211,17 @@ def insert_directories(insert_store):
return cur return cur
def test_crossstore_directory_insert(insert_directories): def test_crossstore_directory_insert(insert_directories: sqlite3.Cursor) -> None:
cur = insert_directories cur = insert_directories
with pytest.raises(sqlite3.IntegrityError): with pytest.raises(sqlite3.IntegrityError):
# declaring directory as belonging to store 2, but parent's store is 1 # declaring directory as belonging to store 2, but parent's store is 1
cur.execute( cur.execute(
"INSERT INTO filedir VALUES " "(?, ?, ?, ?, ?)", "INSERT INTO filedir VALUES (?, ?, ?, ?)",
( (
None, # id INTEGER PRIMARY KEY NOT NULL, "b9774e78-2646-476e-b4a8-f6df0b10ba8e", # uuid
2, # store INTEGER NOT NULL, "3be35ff6-6a7b-49cf-ba3e-c610b781b54e", # store
"some_dir", # filename TEXT, -- only a filename, not a path "some_dir", # filename
1, # parent INTEGER REFERENCES filedir ON UPDATE CASCADE, "baad81ef-7cc6-48df-973c-e9f32ceda19a", # parent
False, # frozen BOOL NOT NULL,
), ),
) )
for row in cur.connection.iterdump(): for row in cur.connection.iterdump():
@ -258,7 +231,7 @@ def test_crossstore_directory_insert(insert_directories):
@pytest.fixture @pytest.fixture
def insert_files(insert_directories): def insert_files(insert_directories: sqlite3.Cursor) -> sqlite3.Cursor:
cur = insert_directories cur = insert_directories
cur.execute("SELECT COUNT(*) FROM filedir") cur.execute("SELECT COUNT(*) FROM filedir")
(nprev,) = cur.fetchone() (nprev,) = cur.fetchone()
@ -328,7 +301,7 @@ def insert_files(insert_directories):
# TODO: This test is disabled until triggers are added to check for these types # TODO: This test is disabled until triggers are added to check for these types
# of constraints. These became much more complicated to check when I added # of constraints. These became much more complicated to check when I added
# filedir_version. # filedir_version.
def disabled_test_nondir_parent_directory_insert(insert_files): def disabled_test_nondir_parent_directory_insert(insert_files: sqlite3.Cursor) -> None:
cur = insert_files cur = insert_files
with pytest.raises(sqlite3.IntegrityError): with pytest.raises(sqlite3.IntegrityError):
# declaring parent as 5, but 5 is a file (plots.png) # declaring parent as 5, but 5 is a file (plots.png)

View File

@ -1,19 +1,23 @@
"""Test functionality of the Store class."""
from nancy import store
import os import os
from pathlib import Path from pathlib import Path
import pytest import pytest
import sys import sys
import tempfile import tempfile
from typing import Iterator
@pytest.fixture @pytest.fixture
def bare_dir(): def bare_dir() -> Iterator[Path]:
"""Create an emptry temp directory""" """Create an emptry temp directory"""
with tempfile.TemporaryDirectory(prefix="nancy_testdir") as d: with tempfile.TemporaryDirectory(prefix="nancy_testdir") as d:
yield Path(d) yield Path(d)
@pytest.fixture @pytest.fixture
def filled_dir(bare_dir): def filled_dir(bare_dir: Path) -> Path:
open(bare_dir / "a.txt", "w").write("foo") open(bare_dir / "a.txt", "w").write("foo")
os.makedirs(bare_dir / "stats") os.makedirs(bare_dir / "stats")
open(bare_dir / "stats" / "metrics.csv", "w").write("bar,baz") open(bare_dir / "stats" / "metrics.csv", "w").write("bar,baz")
@ -22,24 +26,23 @@ def filled_dir(bare_dir):
return bare_dir return bare_dir
def test_record_untracked_dir(filled_dir): def test_record_untracked_dir(filled_dir: Path) -> None:
from nancy.cli.record import record from nancy.cli.record import record
record(filled_dir, message="test_record_untracked_dir") record(store_path=filled_dir, message="test_record_untracked_dir")
@pytest.fixture @pytest.fixture
def store(): def empty_store() -> Iterator[store.Store]:
from nancy import store
s = store.Store.init(message="test init") s = store.Store.init(message="test init")
yield s yield s
def test_schema_version_match(store): def test_schema_version_match(empty_store: store.Store) -> None:
from nancy.version import schema_version from nancy.version import schema_version
cur = store.conn.cursor() assert empty_store.conn is not None
cur = empty_store.conn.cursor()
(db_schema_ver,) = cur.execute("PRAGMA user_version;").fetchone() (db_schema_ver,) = cur.execute("PRAGMA user_version;").fetchone()
assert schema_version == db_schema_ver assert schema_version == db_schema_ver