diff --git a/poetry.lock b/poetry.lock
index 044f84c..6a79068 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -263,7 +263,7 @@ testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-
 [metadata]
 lock-version = "1.1"
 python-versions = "^3.7"
-content-hash = "49ef6dea324178124adc9639e6329fff4411087ca60907b9661d138d6b6e88aa"
+content-hash = "c44b5e718866e498815a184650d98fd5e637b07246375d1e9fc4298a0125f0d6"
 
 [metadata.files]
 attrs = [
diff --git a/pyproject.toml b/pyproject.toml
index d3b59aa..e94519a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,6 +7,7 @@ authors = ["Jacob Hinkle "]
 [tool.poetry.dependencies]
 python = "^3.7"
 click = "^8.1.3"
+colorama = "^0.4.5"
 
 [tool.poetry.dev-dependencies]
 black = "^22.8.0"
diff --git a/src/nancy/cli/__init__.py b/src/nancy/cli/__init__.py
index 26e008b..50a9fdd 100644
--- a/src/nancy/cli/__init__.py
+++ b/src/nancy/cli/__init__.py
@@ -3,6 +3,7 @@ import click
 from ..version import __version__
 
 #from .freeze import freeze, thaw
+from . import diff
 from . import record
 
 
@@ -30,11 +31,9 @@ def version():
     cls=AliasedGroup,
     help=f"Composable provenance tracking for scientific data")
 def main():
-    """
-
-    """
     pass
 #main.add_command(freeze)
 #main.add_command(thaw)
+main.add_command(diff.diff_cli, name='diff')
 main.add_command(record.record_cli, name='record')
 main.add_command(version)
diff --git a/src/nancy/cli/diff.py b/src/nancy/cli/diff.py
new file mode 100644
index 0000000..d836182
--- /dev/null
+++ b/src/nancy/cli/diff.py
@@ -0,0 +1,84 @@
+import click
+
+from .. import db, fs, store
+
+import os
+import sys
+import warnings
+
+
+def print_diff(ABdiff: fs.FSDiff, version_info='count', indent=2,
+        indent_level=0, use_color=True):
+    """Pretty print an FSDiff object"""
+    if use_color:
+        try:
+            from colorama import Fore, Style
+        except ImportError:
+            warnings.warn("Could not import colorama library. Color output disabled.")
+            use_color = False
+
+    changetags = dict(
+        NEW=Fore.GREEN + 'NEW' + Style.RESET_ALL if use_color else 'NEW',
+        DEL=Fore.RED + 'DEL' + Style.RESET_ALL if use_color else 'DEL',
+        MOD=Fore.YELLOW + 'MOD' + Style.RESET_ALL if use_color else 'MOD',
+    )
+
+    for l, d in ABdiff.flatten_tree():
+        if d.A is None:
+            assert d.B is not None
+            print(changetags['NEW'], d.B.unfrozen_perms,
+                    #(indent_level + l) * indent * ' ' + d.B.filename,
+                    d.B.relpath[2:],
+            )
+        elif d.B is None:
+            print(changetags['DEL'], d.A.unfrozen_perms, (indent_level + l) * indent * ' ' + d.A.filename)
+        else:
+            print(changetags['MOD'], d.B.unfrozen_perms, (indent_level + l) * indent * ' ' + d.B.filename)
+
+
+def diff(store, filedir_path, version_info='count'):
+    """Unwrapped diff command that prints a diff"""
+    if not os.path.exists(filedir_path):
+        raise FileNotFoundError(f"Cannot diff non-existent file or directory {filedir_path}")
+
+    # get the diff object
+    fsdiff = store.diff(filedir_path)
+
+    print_diff(fsdiff, version_info=version_info)
+
+
+@click.command()
+@click.argument("path", default='.')
+@click.option(
+    "--version-info",
+    type=click.Choice(['count', 'details', 'none'], case_sensitive=False),
+    default='count',
+    help='How much info to display about file versions.',
+)
+def diff_cli(path, version_info):
+    """Detect and describe changes to PATH
+
+    PATH is a path to a file or directory inside an existing nancy store
+    (a directory containing a file called "nancy.db"). If no "nancy.db" is
+    detected in this directory it is searched for up the filesystem hierarchy.
+    However, the printed diff information pertains to the given value of
+    PATH.
+    """
+    storepath = store.find_store(path)
+
+    if storepath is None:
+        print("Could not find nancy.db in any directory containing "
+                + str(os.path.realpath(path)), file=sys.stderr)
+        sys.exit(1)
+
+    print("Paths relative to store path:", storepath)
+
+    # connect to store
+    s = store.Store(storepath)
+
+    try:
+        diff(store=s, filedir_path=path, version_info=version_info)
+    except FileNotFoundError as e:
+        print(str(e), file=sys.stderr)
+        sys.exit(1)
+
diff --git a/src/nancy/fs.py b/src/nancy/fs.py
index b87dcd9..c6a681d 100644
--- a/src/nancy/fs.py
+++ b/src/nancy/fs.py
@@ -61,32 +61,43 @@ def make_readonly_recursive(path, excluded=[]):
         remove_write_perms(os.path.join(path, p))
 
 
+@dataclass
+class FSEntryVersion:
+    """One recorded version of a hashed file or directory."""
+    unfrozen_perms: str  # stat.filemode(os.stat(path).st_mode): '-rw-rw-r--'
+    symlink_target: str  # if this is a symlink, this is the (read but not fully
+                         # resolved) target. I.e. this is the "content" of the symlink.
+    sha256: str
+    id: int = None  # should not be null
+    version_counter: int = None  # incremented whenever a new version of the file is recorded
+    source_task_id: int = None
+
+
 @dataclass
 class FSEntry:
     """A hashed file or directory."""
-    name: str  # with parent directory stripped. None if this is the root
+    id: int  # database id; None for entries not loaded from the db
+    filename: str  # with parent directory stripped. None if this is the root
     relpath: str  # relative to some root directory
     # children for dirs only: non-recursive; files/dirs at this level only
     children: List['FSEntry']
     filetype: str  # regular, symlink, special (block, char, pipe, or socket)
-    perms: str  # e.g. '-rw-rw-r--'
-    # For directories, sha256 is computed from concatenation of all children
-    # sorted alphabetically regardless of type using "{name}{hash}" format
-    sha256: bytes  # sha256 of content for regular files, or of target for links
+    deleted: bool
+    versions: List[FSEntryVersion] = None  # None or empty: no recorded versions
+
+    # these will be filled from the version list automatically
+    unfrozen_perms: str = None  # stat.filemode(os.stat(path).st_mode): '-rw-rw-r--'
+    symlink_target: str = None  # if this is a symlink, this is the (read but not fully
+                                # resolved) target. I.e. this is the "content" of the symlink.
+    sha256: str = None
+    latest_version: FSEntryVersion = None
 
     def __post_init__(self):
-        # compute sha256 if it's missing
-        if self.sha256 is None:
-            m = hashlib.sha256()
-            for d in self.subdirs:
-                assert d.sha256 is not None
-                m.update(bytes(d.dirname, 'utf-8'))
-                m.update(d.sha256)
-            for f in self.files:
-                assert f.sha256 is not None
-                m.update(bytes(f.filename, 'utf-8'))
-                m.update(f.sha256)
-            self.sha256 = m.digest()
+        if self.versions:
+            self.latest_version = self.versions[-1]
+            self.unfrozen_perms = self.latest_version.unfrozen_perms
+            self.symlink_target = self.latest_version.symlink_target
+            self.sha256 = self.latest_version.sha256
 
     @classmethod
     def from_path(cls, root, relpath='.', filestat=None):
@@ -101,11 +112,12 @@ class FSEntry:
         s = filestat.st_mode
 
         children = []
+        symlink_target = None
         if stat.S_ISLNK(s):
             # Check links first, since it is not exclusive with dir or file checks
             filetype = 'LNK'
-            m.update(bytes(os.readlink(path), 'utf-8'))
-            print(path, "is link")
+            symlink_target = os.readlink(path)
+            m.update(bytes(symlink_target, 'utf-8'))
         elif stat.S_ISDIR(s):
             filetype = 'DIR'
 
@@ -128,6 +140,11 @@ class FSEntry:
                     for e in direntries
                 ]
             for c in children:  # now hash concatenated sorted hashes
+                # hash on perms+sha256 to enable recursively detecting perm
+                # changes without modifying the hashes of individual files,
+                # which remain content-based for compatibility with
+                # other tools
+                m.update(bytes(c.unfrozen_perms, 'utf-8'))
                 m.update(c.sha256)
         elif stat.S_ISREG(s):
             filetype = 'REG'
@@ -152,53 +169,104 @@ class FSEntry:
         sha256 = m.digest()
 
         return cls(
-            name=os.path.basename(relpath),
+            id=None,
+            filename=os.path.basename(relpath),
             relpath=relpath,
             children=children,
             filetype=filetype,
-            perms=stat.filemode(filestat.st_mode),
-            sha256=sha256,
+            deleted=False,
+            versions=[
+                FSEntryVersion(
+                    unfrozen_perms=stat.filemode(filestat.st_mode),
+                    symlink_target=symlink_target,
+                    sha256=sha256,
+                )
+            ],
         )
 
     @classmethod
     def empty_root(cls):
         """Just a standardized value indicating an empty root directory"""
         return cls(
-            name='.',
+            id=None,
+            filename='.',
             relpath='.',
             children=[],
             filetype='DIR',
-            perms='----------',
+            unfrozen_perms='----------',
             sha256=hashlib.sha256().digest(),
+            deleted=False,
         )
 
     @classmethod
     def from_db_index(cls, cursor, root_id=None, root_row=None):
         """Given id of an entry in filedir, recursively fill this object"""
-        fields = '(id, filename, filetype, unfrozen_perms, frozen, sha256, symlink_target)'
+        fields = 'id, filename, filetype, frozen, deleted'
         if root_row is None:
-            cursor.execute(f'SELECT {fields} FROM filedir WHERE id={root_id};')
+            assert root_id is not None
+            cursor.execute(f'SELECT {fields} FROM filedir WHERE id=?', (root_id,))
             root_row = cursor.fetchone()
-        root_id, name, filetype, unfrozen_perms, frozen, sha256, symlink_target = root_row
-        # get children
-        cursor.execute(f'SELECT {fields} FROM filedir WHERE parent={root_id} ORDER BY name;')
+        root_id, filename, filetype, frozen, deleted = root_row
+
+        cursor.execute(f'SELECT {fields} FROM filedir WHERE parent=? ORDER BY name', (root_id,))
         rows = cursor.fetchall()
+        children = [cls.from_db_index(cursor, root_row=r) for r in rows]
+
+        fields = 'unfrozen_perms, symlink_target, sha256, id, version_counter, source_task_id'
+        cursor.execute(f'SELECT {fields} FROM filedir_version WHERE filedir=? ORDER BY version_counter', (root_id,))
+        matches = cursor.fetchall()
+        versions = [FSEntryVersion(*row) for row in matches]
+
         return cls(
-            name=name,
+            id=root_id,
+            filename=filename,
             relpath=relpath,
-            children=[cls.from_db_index(cursor, root_row=r) for r in rows],
+            children=children,
             filetype=filetype,
-            perms=unfrozen_perms,
-            sha256=bytes.fromhex(sha256),
+            deleted=deleted,
+            versions=versions,
         )
 
-    def diff(self, other):
-        return FSDiff(self, other)
+    def fetch_versions(self, cursor):
+        """Get a list from the database of FSEntryVersion objects."""
+        if self.id is None:
+            return None
+
+    def flatten_tree(self, level=0):
+        """Return list of all entries, with level, in pairs"""
+        pairs = [(level, self)]
+        for c in sorted(self.children, key=lambda e: e.filename):
+            pairs.extend(c.flatten_tree(level=level + 1))
+        return pairs
 
 
+def sort_diffs_filename(diffs):
+    name_ent = {e.filename(): e for e in diffs}
+    return [name_ent[n] for n in sorted(name_ent.keys())]
+
+
+@dataclass
 class FSDiff:
-    def __init__(self, A, B):
-        """Given two hashed directories, efficiently compute difference.
+    A: FSEntry  # record the comparisons
+    B: FSEntry  # a missing entry indicates new or deleted
+    modified_children: List['FSDiff']
+
+    @staticmethod
+    def compare(A, B):
+        return A.sha256 == B.sha256 and \
+                A.unfrozen_perms == B.unfrozen_perms and \
+                A.filetype == B.filetype and \
+                A.deleted == B.deleted
+
+    def filename(self):
+        return (self.B.filename if self.A is None else self.A.filename)
+
+    def filetype(self):
+        return (self.B.filetype if self.A is None else self.A.filetype)
+
+    @classmethod
+    def compute(cls, A, B):
+        """Given two hashed directories, recursively compute difference.
 
         This assumes the hashes are consistent, so that directories with
         matching hashes need not be inspected deeply.
@@ -208,5 +276,39 @@ class FSDiff:
             modified (Directory): overlay with modified entries from other
             new (Directory): overlay with new entries from other
         """
-        self.A = A
-        self.B = B
+        if A is None:  # new entry
+            return cls(A, B, [
+                cls.compute(None, c) \
+                        for c in sorted(B.children, key=lambda e: e.filename)
+            ])
+        if B is None:  # deleted entry
+            return cls(A, B, [
+                cls.compute(c, None) \
+                        for c in sorted(A.children, key=lambda e: e.filename)
+            ])
+
+        if cls.compare(A, B):
+            # no need to check descendants
+            return cls(A, B, [])
+
+        # there are modifications at this level, not just insert/delete
+
+        # list children and find overlaps
+        Alist = {c.filename: c for c in A.children}
+        Blist = {c.filename: c for c in B.children}
+
+        allnames = set(Alist) | set(Blist)
+
+        modified_children = [cls.compute(
+            Alist.get(n, None),
+            Blist.get(n, None),
+        ) for n in allnames]
+
+        return cls(A, B, modified_children)
+
+    def flatten_tree(self, level=0):
+        """Return list of all entries, with level, in pairs"""
+        pairs = [(level, self)]
+        for c in sorted(self.modified_children, key=lambda d: d.filename()):
+            pairs.extend(c.flatten_tree(level=level + 1))
+        return pairs
diff --git a/src/nancy/schema/version0.sql b/src/nancy/schema/version0.sql
index c777ab7..38d2e97 100644
--- a/src/nancy/schema/version0.sql
+++ b/src/nancy/schema/version0.sql
@@ -91,7 +91,7 @@ CREATE TABLE user(id INTEGER PRIMARY KEY NOT NULL,
 CREATE TABLE store (id INTEGER PRIMARY KEY NOT NULL,
     machine INTEGER,
     dbpath TEXT NOT NULL,
-    uuid TEXT NOT NULL, -- str(uuid.uuid4())
+    uuid TEXT NOT NULL, -- UUID generated by str(uuid.uuid4())
     imported BOOL, -- is this the store for the current directory? If not, then it was
                    -- imported in order to describe a dependency.
 
@@ -152,7 +152,7 @@ END; END;
 -- potentially a datum).
 CREATE TABLE filedir_version (id INTEGER PRIMARY KEY NOT NULL,
     filedir INTEGER REFERENCES filedir ON UPDATE CASCADE, -- parent filedir entry
-    version INTEGER NOT NULL, -- incremented whenever a new version of the file is recorded
+    version_counter INTEGER NOT NULL, -- incremented whenever a new version of the file is recorded
 
     -- We record the permissions on each file, in a way that enables reloading
     -- permissions properly when thawing after a freeze operation.
diff --git a/src/nancy/store.py b/src/nancy/store.py
index 4ac4247..1527a58 100644
--- a/src/nancy/store.py
+++ b/src/nancy/store.py
@@ -135,12 +135,48 @@ class Store:
         """Make store directory read-only (except for nancy.db) and return file list"""
         fs.make_readonly_recursive(self.path, excluded='./nancy.db')
 
+    def filedir_root_index(self, cur=None):
+        """Get the database id for the table entry in this store having name '.'"""
+        if cur is None:
+            cur = self.conn.cursor()
+        cur.execute('SELECT id FROM filedir WHERE store=0 AND parent is NULL;')
+        row = cur.fetchone()
+        return None if row is None else row[0]
+
+    def path_to_fsentry(self, path):
+        """Find a path in the filedir database and return it as an fsentry.
+
+        If the path is not found in the store, None is returned.
+        """
+        cur = self.conn.cursor()
+
+        # get relative path to resolved path
+        rel = os.path.relpath(os.path.realpath(path), start=os.path.realpath(self.path))
+
+        # rel tells us how to descend recursively to find the filedir for path
+        fd_id = self.filedir_root_index(cur)
+        if fd_id is None:
+            # Root isn't even inserted into the db yet
+            return None
+
+        for p in Path(rel).parts:  # Path.parts splits a path reliably
+            # get child with that name
+            cur.execute(
+                'SELECT id FROM filedir WHERE filename=? AND parent=? LIMIT 1',
+                (p, fd_id),
+            )
+            row = cur.fetchone()
+
+            if row is None:
+                return None
+
+            fd_id = row[0]
+        return fs.FSEntry.from_db_index(cur, root_id=fd_id)
+
+
     def fs_entries(self, shallow=False):
         """Return recursive structure containing FSEntry objects from db"""
-        # get the database id for the table entry in this store having name '.'
-        cur = self.conn.cursor()
-        cur.execute('SELECT id FROM filedir WHERE store=0 AND parent is NULL;')
-        root_id = cur.fetchone()
+        root_id = self.filedir_root_index()
         if root_id is None:
             return fs.FSEntry.empty_root()
         else:
@@ -149,15 +185,26 @@ class Store:
     def program(self, name, message=None):
         return Program(self, name, message)
 
+    def diff(self, filepath):
+        """
+        Diff a file or directory compared to its recorded version
+        """
+        # get info about current files at this location
+        current = fs.FSEntry.from_path(
+            self.path,
+            # send a relative path from self.path to filepath, which is user
+            # provided
+            os.path.relpath(os.path.realpath(filepath), os.path.realpath(self.path)),
+        )
+
+        recorded = self.path_to_fsentry(filepath)
+
+        return fs.FSDiff.compute(recorded, current)
+
     def record(self, message=None):
         with self.program('RECORD', message) as p:
-            # get hashes of current directory (recursive)
-            current = fs.FSEntry.from_path(self.path)
 
-            # extract hashes for all previously-cataloged files/dirs
-            recorded = self.fs_entries(shallow=True)
-
-            d = current.diff(recorded)
+            d = self.diff(self.path)
 
             # select all dirs
 
@@ -217,7 +264,7 @@ def find_store(path):
     while True:
         if os.path.exists(os.path.join(d, "nancy.db")):
             return d
-        newd = os.path.dirname(path)
+        newd = os.path.dirname(d)
         if newd == d:  # indicates this is a filesystem root like C:\ or /
             return None
         d = newd