diff --git a/src/nancy/cli/__init__.py b/src/nancy/cli/__init__.py index 0308404..3270a06 100644 --- a/src/nancy/cli/__init__.py +++ b/src/nancy/cli/__init__.py @@ -2,7 +2,8 @@ import click from ..version import __version__ -from .freeze import freeze, thaw +#from .freeze import freeze, thaw +from .record import record # from https://click.palletsprojects.com/en/5.x/advanced/ @@ -33,6 +34,7 @@ def main(): """ pass -main.add_command(freeze) -main.add_command(thaw) +#main.add_command(freeze) +#main.add_command(thaw) +main.add_command(record) main.add_command(version) diff --git a/src/nancy/cli/record.py b/src/nancy/cli/record.py new file mode 100644 index 0000000..6b82d33 --- /dev/null +++ b/src/nancy/cli/record.py @@ -0,0 +1,29 @@ +import click + +from .. import store + +import os + + +@click.command() +@click.argument("directory", default='.') +def record(directory): + """ + Initialize tracking or record changes to a tracked directory. + + If DIRECTORY is not already part of an existing nancy store, then a new + 'nancy.db' file is created in that directory. On the other hand, if the + directory is part of an existing store, it will be updated and versions + of any files changed since the last recording will be incremented. + """ + if not os.path.isdir(directory): + raise ValueError(f"Cannot record non-existent directory {directory}") + + existing_store = store.find_store(directory) + if existing_store is None: # this is a new store + s = store.Store.init(directory) + else: # this is an existing store + s = store.Store(directory) + + s.record() + diff --git a/src/nancy/db.py b/src/nancy/db.py index 6ad0c48..6aa0906 100644 --- a/src/nancy/db.py +++ b/src/nancy/db.py @@ -3,10 +3,21 @@ import os import sqlite3 +import warnings -# This matches the recorded user_version in any nancy.db initialized in this -# process. 
+# This will match the user_version in any nancy.db initialized by this process schema_version = 0 +# SQLite features we rely on: +# UPSERT (3.24.0, 2018-06-04) +min_sqlite_version = (3, 24, 0) + +sqlite_version = sqlite3.sqlite_version_info +sqlite_verstr = '.'.join(str(v) for v in sqlite_version) +min_sqlite_verstr = '.'.join(str(v) for v in min_sqlite_version) +# compare whole tuples; a field-by-field check would wrongly flag e.g. 4.0.0 +if sqlite_version < min_sqlite_version: + warnings.warn(f"Minimum sqlite version is {min_sqlite_verstr}. Found {sqlite_verstr}") + def init_schema(cur): """Initialize a database following the current schema.""" diff --git a/src/nancy/environment.py b/src/nancy/environment.py new file mode 100644 index 0000000..bb8dec9 --- /dev/null +++ b/src/nancy/environment.py @@ -0,0 +1,73 @@ +from . import user + +from typing import NamedTuple +import json +import os +import platform +import sys +import time + +class Environment(NamedTuple): + id: int + envvars_json: str + python_implementation: str + python_strversion: str + python_hexversion: int + user: user.User + + @classmethod + def find_or_insert(cls, cur, env=None): + """Given a DB cursor, find or create row in environment table and fill""" + if env is None: + env = cls.detect() + + u = user.User.find_or_insert(cur) + + env = env._replace(user=u.id) + + # insert or ignore, handle each case to set id + cur.execute(''' + SELECT + id + FROM + environment + WHERE + envvars_json = ? AND + python_implementation = ? AND + python_strversion = ? AND + python_hexversion = ? AND + user = ? + LIMIT 1 + ''', + env[1:], + ) + res = cur.fetchone() + if res is None: + cur.execute(''' + INSERT INTO environment VALUES (?,?,?,?,?,?); + ''', + env, + ) + id = cur.lastrowid + cur.connection.commit() + else: + id = res[0] + + return env._replace(id=id) + + @classmethod + def detect(cls): + """Detect values for environment independent of the database. 
+ + Note that the user entry will not have a valid id. + """ + u = user.User.detect() + + return cls( + None, + json.dumps(dict(os.environ)), + platform.python_implementation(), + sys.version, + sys.hexversion, + u.id, + ) diff --git a/src/nancy/fs.py b/src/nancy/fs.py index cdfff09..b4b03ea 100644 --- a/src/nancy/fs.py +++ b/src/nancy/fs.py @@ -47,18 +47,18 @@ def remove_write_perms(path): def make_readonly_recursive(path, excluded=[]): """Recursively "freeze" a directory by setting all files and directories read-only""" # traversing bottom-up makes it easier to freeze perms on directories - for root, dirs, files in os.walk(self.path, topdown=False): + for root, dirs, files in os.walk(path, topdown=False): for f in files: p = os.path.join(root, f) if p in excluded: continue - remove_write_perms(os.path.join(self.path, p)) + remove_write_perms(p) # p is built from os.walk's root, so it already starts with path for d in dirs: p = os.path.join(root, d) if p in excluded: continue - remove_write_perms(os.path.join(self.path, p)) + remove_write_perms(p) # joining path again would double the prefix for relative paths @dataclass diff --git a/src/nancy/machine.py b/src/nancy/machine.py index ac52420..d971bfd 100644 --- a/src/nancy/machine.py +++ b/src/nancy/machine.py @@ -1,50 +1,95 @@ -from collections import namedtuple +from typing import NamedTuple import json import platform import time -MachineInfo = namedtuple( - 'MachineInfo', - [ - 'machine_id', - 'hostname', - 'processor', - 'system', - 'release', - 'cpu_type', - 'timezone', - 'freedesktop_os_release', - 'win32_ver', - 'mac_ver', - ], -) -def get_machine_info(): - fdor = '' - try: - fdor = json.dumps(platform.freedesktop_os_release()) - except AttributeError: - # freedesktop_os_release only available for python >= 3.10 +class Machine(NamedTuple): + id: int + machine_id: str + hostname: str + processor: str + system: str + release: str + cpu_type: str + timezone: str + freedesktop_os_release: str + win32_ver: str + mac_ver: str + + @classmethod + def find_or_insert(cls, cur, machine=None): + """Given a 
DB cursor, find or create row in machine table and fill""" + if machine is None: + machine = cls.detect() + + # insert or ignore, handle each case to set id + cur.execute(''' + SELECT + id + FROM + machine + WHERE + machine_id IS ? AND -- NULL-safe match: detect() can leave machine_id as None + hostname = ? AND + processor = ? AND + system = ? AND + release = ? AND + cpu_type = ? AND + timezone = ? AND + freedesktop_os_release = ? AND + win32_ver = ? AND + mac_ver = ? + LIMIT 1 + ''', + machine[1:] + ) + res = cur.fetchone() + if res is None: + cur.execute(''' + INSERT INTO machine VALUES (?,?,?,?,?,?,?,?,?,?,?); + ''', + machine, + ) + id = cur.lastrowid + cur.connection.commit() + else: + id = res[0] + + return machine._replace(id=id) + + @classmethod + def detect(cls): + """Formats machine-specific information into a Machine object. + + Note that 'Machine' objects are properly formatted to be inserted into + the `machine` table. + """ fdor = '' - system = platform.system() - - mid = None - if system == 'Linux': try: - mid = open('/etc/machine-id', 'r').read() - except FileNotFoundError: - pass + fdor = json.dumps(platform.freedesktop_os_release()) + except (AttributeError, OSError): + # missing before python 3.10 (AttributeError); OSError when no os-release file exists + fdor = '' + system = platform.system() - return MachineInfo( - machine_id=mid, - hostname=platform.node(), - processor=platform.processor(), - system=system, - release=platform.release(), - cpu_type=platform.machine(), - timezone=time.tzname[time.daylight], - freedesktop_os_release=fdor, - win32_ver=json.dumps(platform.win32_ver()), - mac_ver=json.dumps(platform.mac_ver()), - ) + mid = None + if system == 'Linux': + try: + mid = open('/etc/machine-id', 'r').read() + except FileNotFoundError: + pass + return cls( + None, + machine_id=mid, + hostname=platform.node(), + processor=platform.processor(), + system=system, + release=platform.release(), + cpu_type=platform.machine(), + timezone=time.tzname[time.daylight], + freedesktop_os_release=fdor, 
+ win32_ver=json.dumps(platform.win32_ver()), + mac_ver=json.dumps(platform.mac_ver()), + ) diff --git a/src/nancy/schema/version0.sql b/src/nancy/schema/version0.sql index 4641980..d56a7f8 100644 --- a/src/nancy/schema/version0.sql +++ b/src/nancy/schema/version0.sql @@ -35,7 +35,11 @@ CREATE TABLE machine(id INTEGER PRIMARY KEY NOT NULL, -- 'VERSION': '22.05 (Quokka)', 'VERSION_CODENAME': 'quokka', 'VERSION_ID': -- '22.05'}" win32_ver TEXT, -- platform.win32_ver() as JSON - mac_ver TEXT -- platform.mac_ver() as JSON + mac_ver TEXT, -- platform.mac_ver() as JSON + + -- disallow duplicate rows. NOTE(review): machine_id is not part of this key; confirm that is intended + UNIQUE(hostname, processor, system, release, cpu_type, timezone, + freedesktop_os_release, win32_ver, mac_ver) ); -- Programs are run by users on machines CREATE TABLE user(id INTEGER PRIMARY KEY NOT NULL, @@ -45,7 +49,11 @@ CREATE TABLE user(id INTEGER PRIMARY KEY NOT NULL, -- on Windows: see https://stackoverflow.com/questions/21766954/how-to-get-windows-users-full-name-in-python machine INTEGER NOT NULL, - UNIQUE(userid, machine), + -- usernames and fullnames change rarely, but it happens. A more restrictive + -- constraint would not enable tracking of this info. The following only + -- guarantees no duplicate full rows. + UNIQUE(username, userid, fullname, machine), + FOREIGN KEY (machine) REFERENCES machine (id) ON UPDATE CASCADE ); @@ -118,6 +126,9 @@ CREATE TABLE environment (id INTEGER PRIMARY KEY NOT NULL, python_hexversion INTEGER, -- sys.hexversion: 50923504 user INTEGER, + -- disallow duplicate rows + UNIQUE(envvars_json, python_implementation, python_strversion, python_hexversion, user), + FOREIGN KEY (user) REFERENCES user (id) ON UPDATE CASCADE ); @@ -131,7 +142,7 @@ CREATE TABLE program (id INTEGER PRIMARY KEY NOT NULL, name TEXT, -- name of the program, usually written lowercase by calling code e.g. cnn_crossval -- Names of built-in operations will be shown in upper case: e.g. 'FREEZE' - -- we use POSIX timestamps in UTC for time recording. 
+ -- we use POSIX timestamps for time recording. -- e.g. datetime.datetime.now().timestamp() start_time REAL, end_time REAL, @@ -139,6 +150,7 @@ CREATE TABLE program (id INTEGER PRIMARY KEY NOT NULL, process_id INTEGER, -- host PID of python process on host OS environment INTEGER NOT NULL, message TEXT, -- user-defined message to help distinguish similar runs + FOREIGN KEY (environment) REFERENCES environment (id) ON UPDATE CASCADE ); @@ -166,7 +178,7 @@ CREATE TABLE py_module(id INTEGER PRIMARY KEY NOT NULL, CREATE TABLE func(id INTEGER PRIMARY KEY NOT NULL, name TEXT NOT NULL, module INTEGER NOT NULL, - -- we use POSIX timestamps in UTC for time recording. + -- we use POSIX timestamps for time recording. -- e.g. datetime.datetime.now().timestamp() start_time REAL, end_time REAL, diff --git a/src/nancy/store.py b/src/nancy/store.py index 8bf9108..2b50c08 100644 --- a/src/nancy/store.py +++ b/src/nancy/store.py @@ -1,14 +1,65 @@ """Utilities for creating new stores and linking between them.""" -from . import db, fs, machine +from . import db, environment, fs, machine +import datetime import importlib import json import os from pathlib import Path import sqlite3 +from typing import Callable +class Program: + def __init__(self, store, name, message): + self.store = store + self.name = name + self.message = message + + self._evaluated = False + + def set_start_time(self, t): + self.start_time = t + + def __enter__(self): + if self._evaluated: + raise RuntimeError("Cannot re-enter a Program context") + + cur = self.store.conn.cursor() + env = environment.Environment.find_or_insert(cur) + + cur.execute('INSERT INTO program VALUES (?, ?, ?, ?, ?, ?, ?)', ( + None, #id INTEGER PRIMARY KEY NOT NULL, + self.name, #name TEXT, -- name of the program, usually written lowercase by calling code e.g. cnn_crossval + + #-- we use POSIX timestamps for time recording. + #-- e.g. 
datetime.datetime.now().timestamp() + None, #start_time REAL, + None, #end_time REAL, + + os.getpid(), #process_id INTEGER, -- host PID of python process on host OS + env.id, #environment INTEGER NOT NULL, + self.message, #message TEXT, -- user-defined message to help distinguish similar runs + )) + self.id = cur.lastrowid # program row whose times __exit__ fills in + + self.set_start_time(datetime.datetime.now()) + # track this program in the store + + return self + + def __exit__(self, exc_type, exc_value, exc_traceback): + end_time = datetime.datetime.now() + # record start and end times in store (this also commits the INSERT from __enter__) + self.store.conn.execute( + 'UPDATE program SET start_time = ?, end_time = ? WHERE id = ?', + (self.start_time.timestamp(), end_time.timestamp(), self.id), + ) + self.store.conn.commit() + + self._evaluated = True # prevent re-running + class Store: """Describes a data directory, holds active connection to nancy.db""" @@ -45,7 +96,8 @@ class Store: return self.conn @classmethod - def init(cls, directory=None): + def init(cls, directory=None, message=None): + start_time = datetime.datetime.now() if directory is None: # initialize an in-memory store db_path = ':memory:' else: @@ -63,18 +115,12 @@ class Store: db.init_schema(conn.cursor()) new_store = cls(directory, conn) - new_store.record_machine_description() + with new_store.program('INIT', message) as p: + # set the timing to the actual times it took to initialize the db + p.set_start_time(start_time) return new_store - def record_machine_description(self): - """Record machine-specific information""" - cur = self.conn.cursor() - machine_info = machine.get_machine_info() - cur.execute('INSERT INTO machine VALUES(NULL,?,?,?,?,?,?,?,?,?,?)', machine_info) - self.machine_id = cur.lastrowid - self.conn.commit() - def make_readonly(self): """Make store directory read-only (except for nancy.db) and return file list""" fs.make_readonly_recursive(self.path, excluded='./nancy.db') @@ -90,48 +136,34 @@ class Store: else: return fs.FSEntry.from_db_index(cur, root_id=root_id) + def program(self, name, message=None): + return Program(self, name, message) - def freeze(self): - # Remove write permissions on all files except nancy.db - # This also catalogs all files, 
directories, and symlinks - # Note that ./nancy.db is NOT listed in files and is not altered - dirs, files, symlinks, irreg = self.make_readonly() - if len(irreg) > 0: - raise Exception( - "Refusing to freeze directory containing irregular files: " - ", ".join(irreg), - ) - for p, perms in symlinks.items(): - target = os.readlink(p) # gives possibly relative path to target - resolved = os.path.realpath(p) - if not resolved.startswith(os.path.abspath(self.path)): - raise Exception( - f"Freezing {p} failed since it points to location " - f"{target} which is outside store path ({self.path})", - ) + def record(self, message=None): + with self.program('RECORD', message) as p: + # get hashes of current directory (recursive) + current = fs.FSEntry.from_path(self.path) - # get hashes of current directory (recursive) - current = fs.FSEntry.from_path(self.path) + # extract hashes for all previously-cataloged files/dirs + recorded = self.fs_entries(shallow=True) - # extract hashes for all previously-cataloged files/dirs - recorded = self.fs_entries(shallow=True) + d = current.diff(recorded) - d = current.diff(recorded) + # select all dirs - # select all dirs + # detect new and deleted files, and those that have changed type - # detect new and deleted files, and those that have changed type + # create entries for all directories if they do not yet exist (top + # down) - # create entries for all directories if they do not yet exist (top - # down) + # insert files and symlinks into store_file, computing checksums on + # each - # insert files and symlinks into store_file, computing checksums on - # each + # update versions in nancy.db as appropriate - # update versions in nancy.db as appropriate - - # remove write permissions on nancy.db - remove_write_perms(os.path.join(self.path, 'nancy.db')) + #p.unlink_file() + #p.record_file() + # all affected files #@contextmanager def run( diff --git a/src/nancy/user.py b/src/nancy/user.py new file mode 100644 index 0000000..d9ec5f7 --- 
/dev/null +++ b/src/nancy/user.py @@ -0,0 +1,72 @@ +from . import machine + +import getpass +import os +import pwd +from typing import NamedTuple + +class User(NamedTuple): + id: int # if not None, this is `id` in the `user` table + username: str + userid: int + fullname: str + machine: machine.Machine + + @classmethod + def find_or_insert(cls, cur, user=None): + """Given a DB cursor, find or create row in user table and fill""" + if user is None: + user = cls.detect() + + m = machine.Machine.find_or_insert(cur) + + user = user._replace(machine=m.id) + + # insert or ignore, handle each case to set id + cur.execute(''' + SELECT + id + FROM + user + WHERE + username = ? AND + userid = ? AND + fullname = ? AND + machine = ? + LIMIT 1 + ''', + user[1:], + ) + res = cur.fetchone() + if res is None: + cur.execute(''' + INSERT INTO user VALUES (?,?,?,?,?); + ''', + user, + ) + id = cur.lastrowid + cur.connection.commit() + else: + id = res[0] + + return user._replace(id=id) + + @classmethod + def detect(cls): + """Detect values for user independent of the database. + + Note that the machine entry will not have a valid id. + """ + # TODO: pwd (imported above) is Unix-only, so this module cannot be imported on Windows + fullname = pwd.getpwuid(os.getuid()).pw_gecos + + m = machine.Machine.detect() + + return cls( + None, + getpass.getuser(), + os.getuid(), + fullname, + m.id, + ) + diff --git a/tests/test_db.py b/tests/test_db.py index 48ccedd..12c16e9 100644 --- a/tests/test_db.py +++ b/tests/test_db.py @@ -115,7 +115,7 @@ def test_invalid_user_machine(insert_user): None, #id INTEGER PRIMARY KEY NOT NULL, 'jacob', #username TEXT NOT NULL, 101, #userid INTEGER, - 'Bozo the Clown', #fullname TEXT, + 'Jacob Hinkle', #fullname TEXT, 1, #machine INTEGER NOT NULL, ), )