diff --git a/src/nancy/cli/__init__.py b/src/nancy/cli/__init__.py index 87b5313..0308404 100644 --- a/src/nancy/cli/__init__.py +++ b/src/nancy/cli/__init__.py @@ -19,10 +19,20 @@ class AliasedGroup(click.Group): ctx.fail("Too many matches: %s" % ", ".join(sorted(matches))) -@click.group(f"nancy v{__version__}", cls=AliasedGroup) +@click.command() +def version(): + """Print version information.""" + print(f"nancy v{__version__}") + + +@click.group( + cls=AliasedGroup, + help=f"Composable provenance tracking for scientific data") def main(): + """ + + """ pass - - main.add_command(freeze) main.add_command(thaw) +main.add_command(version) diff --git a/src/nancy/cli/freeze.py b/src/nancy/cli/freeze.py index 77c2a7d..18e2867 100644 --- a/src/nancy/cli/freeze.py +++ b/src/nancy/cli/freeze.py @@ -2,27 +2,43 @@ import click from .. import store +import os + @click.command() @click.argument("directory") def freeze(directory): + """ + Initialize tracking in a directory or freeze a tracked directory. + + If DIRECTORY is not already part of an existing nancy store, then a new + 'nancy.db' file is created in that directory. On the other hand, if the + directory is part of an existing store, it will be re-frozen and versions + of any files changed since thawing will be incremented.
+ """ if not os.path.isdir(directory): raise ValueError(f"Cannot freeze non-existent directory {directory}") - existing_store = db.find_store(directory) + existing_store = store.find_store(directory) if existing_store is None: # this is a new store - s = store.init(directory) + s = store.Store.init(directory) else: # this is an existing store - store.connect(directory) + s = store.Store(directory) - # remove write permissions on all files except nancy.db - - # compute checksums on all files, update versions in nancy.db as appropriate - - # remove write permissions on nancy.db + s.freeze() @click.command() @click.argument("files", nargs=-1) # , help="Files or directories to thaw.") def thaw(files): + """ + Enable manual alteration of files within a tracked directory. + + This command is meant to be used in conjunction with the 'freeze' + subcommand. After thawing, changes may be made in the current directory, + after which `nancy freeze` should be run. + At that point, changes will be recorded: + new files will be detected and modified files will have + their version numbers incremented. + """ pass diff --git a/src/nancy/store.py b/src/nancy/store.py index 0c28a50..4515e5b 100644 --- a/src/nancy/store.py +++ b/src/nancy/store.py @@ -1,6 +1,206 @@ """Utilities for creating new stores and linking between them.""" +from .
import db + +import importlib +import json import os +from pathlib import Path +import sqlite3 +import stat + + +def remove_write_perms(path): + """Remove write permissions for all users while preserving other perms""" + s = os.stat(path) + orig_perm_string = stat.filemode(s) + os.chmod( + path, + s.st_mode ^ stat.S_IWUSR ^ stat.S_IWGRP ^ stat.S_IWOTH, + follow_symlinks=False, + ) + return orig_perm_string + + +class Store: + """Describes a data directory, holds active connection to nancy.db""" + + def __init__(self, directory=None, conn=None): + """ + Arguments: + directory (str): Location of existing store directory. If omitted + or None, initialize a store in memory, with no associated + directory. + """ + if directory is None: + self.path = None + self.db_path = ':memory:' + else: + self.path = Path(directory) + self.db_path = self.path / "nancy.db" + if conn is None: + self.connect() + else: + self.conn = conn + + def copy(self, store_path): + """Copy this store to a new store path""" + dst_db_path = os.path.join(store_path, "nancy.db") + dst_conn = sqlite3.connect(dst_db_path) + self.conn.backup(dst_conn) + dst_conn.close + return self.__class__(store_path) + + def connect(self): + self.conn = sqlite3.connect(self.db_path) + self.conn.cursor().execute("PRAGMA foreign_keys = ON;") + return self.conn + + @classmethod + def init(cls, directory=None): + if directory is None: # initialize an in-memory store + db_path = ':memory:' + else: + if not os.path.isdir(directory): + raise FileNotFoundError( + f"Directory {directory} must exist before initializing a store there.", + ) + db_path = os.path.join(directory, 'nancy.db') + if os.path.isfile(db_path): + raise FileExistsError( + f"File {db_path} exists. 
Refusing to re-initialize", + ) + # initialize a database in the target directory + schema = importlib.resources.open_text("nancy", "schema.sql").read() + conn = sqlite3.connect(db_path) + conn.cursor().executescript(schema) + new_store = cls(directory, conn) + + new_store.record_machine_description() + + return new_store + + def record_machine_description(self): + """Record machine-specific information""" + import platform, time + cur = self.conn.cursor() + fdor = '' + try: + fdor = json.dumps(platform.freedesktop_os_release()) + except AttributeError: + # freedesktop_os_release only available for python >= 3.10 + fdor = '' + machine_info = ( + platform.node(), + platform.processor(), + platform.system(), + platform.release(), + platform.machine(), + time.tzname[time.daylight], + fdor, + json.dumps(platform.win32_ver()), + json.dumps(platform.mac_ver()), + ) + cur.execute('INSERT INTO machine VALUES(NULL,?,?,?,?,?,?,?,?,?)', machine_info) + self.conn.commit() + + def make_readonly(self): + """Make store directory read-only (except for nancy.db) and return file list""" + dirpaths = {} + regfilepaths = {} + symlinkpaths = {} + irregfilepaths = {} # any file that's not regular or symlink. e.g. 
inodes or sockets + + # traversing bottom-up makes it easier to freeze perms on directories + for root, dirs, files in os.walk(self.path, topdown=False): + for f in files: + p = os.path.join(root, f) + if p == './nancy.db': + continue + + orig_perms = remove_write_perms(os.path.join(self.path, p)) + + if not os.path.isfile(p): + # not a link or regular file + irregfilepaths[p] = orig_perms + + if os.path.islink(p): + symlinkpaths[p] = orig_perms + else: + regfilepaths[p] = orig_perms + + for d in dirs: + p = os.path.join(root, d) + + orig_perms = remove_write_perms(os.path.join(self.path, p)) + + dirpaths[p] = orig_perms + return dirpaths, regfilepaths, symlinkpaths, irregfilepaths + + def file_hashes(self, local_only=True): + """Get recorded hashes of all files currently tracked in this store""" + + + def freeze(self): + # Remove write permissions on all files except nancy.db + # This also catalogs all files, directories, and symlinks + # Note that ./nancy.db is NOT listed in files and is not altered + dirs, files, symlinks, irreg = self.make_readonly() + if len(irreg) > 0: + raise Exception( + "Refusing to freeze directory containing irregular files: " + ", ".join(irreg), + ) + for p, perms in symlinks.items(): + target = os.path.readlink(p) # gives possibly relative path to tgt + resolved = os.path.realpath(p) + if not resolved.startswith(os.path.abspath(self.path)): + raise Exception( + f"Freezing {p} failed since it points to location " + f"{target} which is outside store path ({self.path})", + ) + + # compute checksums on all files + + # detect new and deleted files, and those that have changed type + + # update versions in nancy.db as appropriate + + # remove write permissions on nancy.db + remove_write_perms(os.path.join(self.path, 'nancy.db')) + + #@contextmanager + def run( + self, + name=None, + message=None, + ): + """ + Create a context manager that encapsulates a procedure that can save files. 
+ + Note that this does NOT spawn any new OS processes or threads. + + Example: + + s = nancy.store.Store.init(target_directory) + with s.run("sum_dataframe") as f: + x = PandasDataframe() + y = Sum(x) + f.save('stats/xsum.csv', y) + """ + pass + + +class StoreFile: + """Describes a file that is recorded in the store.""" + + def __init__(self, store, rel_path): + self.store = store + self.rel_path = rel_path + + def save(self): + # call the appropriate save method + pass def find_store(path): @@ -14,7 +214,4 @@ def find_store(path): newd = os.path.dirname(path) if newd == d: # indicates this is a filesystem root like C:\ or / return None - - -def connect(path): - """Given directory, connect to its store""" + d = newd