From 183e3ed61386034007c0b2a294bf11f9a33e6b4e Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Fri, 23 Sep 2022 20:12:10 -0400 Subject: [PATCH] Add working FSDiff and new diff command. This uses colorama so that is added to poetry. Next I'll add the diff printing to the record command and have the corresponding store method use the diff as input. --- poetry.lock | 2 +- pyproject.toml | 1 + src/nancy/cli/__init__.py | 5 +- src/nancy/cli/diff.py | 84 ++++++++++++++++ src/nancy/fs.py | 176 +++++++++++++++++++++++++++------- src/nancy/schema/version0.sql | 4 +- src/nancy/store.py | 70 +++++++++++--- 7 files changed, 289 insertions(+), 53 deletions(-) create mode 100644 src/nancy/cli/diff.py diff --git a/poetry.lock b/poetry.lock index 044f84c..6a79068 100644 --- a/poetry.lock +++ b/poetry.lock @@ -263,7 +263,7 @@ testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest- [metadata] lock-version = "1.1" python-versions = "^3.7" -content-hash = "49ef6dea324178124adc9639e6329fff4411087ca60907b9661d138d6b6e88aa" +content-hash = "c44b5e718866e498815a184650d98fd5e637b07246375d1e9fc4298a0125f0d6" [metadata.files] attrs = [ diff --git a/pyproject.toml b/pyproject.toml index d3b59aa..e94519a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,6 +7,7 @@ authors = ["Jacob Hinkle "] [tool.poetry.dependencies] python = "^3.7" click = "^8.1.3" +colorama = "^0.4.5" [tool.poetry.dev-dependencies] black = "^22.8.0" diff --git a/src/nancy/cli/__init__.py b/src/nancy/cli/__init__.py index 26e008b..50a9fdd 100644 --- a/src/nancy/cli/__init__.py +++ b/src/nancy/cli/__init__.py @@ -3,6 +3,7 @@ import click from ..version import __version__ #from .freeze import freeze, thaw +from . import diff from . import record @@ -30,11 +31,9 @@ def version(): cls=AliasedGroup, help=f"Composable provenance tracking for scientific data") def main(): - """ - - """ pass #main.add_command(freeze) #main.add_command(thaw) +main.add_command(diff.diff_cli, name='diff') main.add_command(record.record_cli, name='record') main.add_command(version) diff --git a/src/nancy/cli/diff.py b/src/nancy/cli/diff.py new file mode 100644 index 0000000..d836182 --- /dev/null +++ b/src/nancy/cli/diff.py @@ -0,0 +1,84 @@ +import click + +from .. import db, fs, store + +import os +import sys +import warnings + + +def print_diff(ABdiff: fs.FSDiff, version_info='count', indent=2, + indent_level=0, use_color=True): + """Pretty print an FSDiff object""" + if use_color: + try: + from colorama import Fore, Back, Style + except ImportError: + warnings.warn("Could not import colorama library. Color output disabled.") + use_color = False + + changetags = dict( + NEW=Fore.GREEN + 'NEW' + Style.RESET_ALL if use_color else 'NEW', + DEL=Fore.RED + 'DEL' + Style.RESET_ALL if use_color else 'DEL', + MOD=Fore.YELLOW + 'MOD' + Style.RESET_ALL if use_color else 'MOD', + ) + + for l, d in ABdiff.flatten_tree(): + if d.A is None: + assert d.B is not None + print(changetags['NEW'], d.B.unfrozen_perms, + #(indent_level + l) * indent * ' ' + d.B.filename, + d.B.relpath[2:], + ) + elif d.B is None: + print(changetags['DEL'], d.A.unfrozen_perms, (indent_level + l) * indent * ' ' + d.A.filename) + else: + print(changetags['MOD'], d.B.unfrozen_perms, (indent_level + l) * indent * ' ' + d.B.filename) + + +def diff(store, filedir_path, version_info='count'): + """Unwrapped diff command that prints a diff""" + if not os.path.exists(filedir_path): + raise FileNotFoundError(f"Cannot diff non-existent file or directory {filedir_path}") + + # get the diff object + fsdiff = store.diff(filedir_path) + + print_diff(fsdiff, version_info=version_info) + + +@click.command() +@click.argument("path", default='.') +@click.option( + "--version-info", + type=click.Choice(['count', 'details', 'none'], case_sensitive=False), + default='count', + help='How much info to display about file versions.', +) +def diff_cli(path, version_info): + """Detect and describe changes to PATH + + PATH is a path to a file or directory inside an existing nancy store + (a directory containing a file called "nancy.db". If no "nancy.db" is + detected in this directory it is searched for up the filesystem hierarchy. + However, the printed diff information pertains to the given value of + PATH. + """ + storepath = store.find_store(path) + + if storepath is None: + print("Could not find nancy.db in any directory containing " + + str(os.path.realpath(path)), file=sys.stderr) + sys.exit(1) + + print("Paths relative to store path:", storepath) + + # connect to store + s = store.Store(storepath) + + try: + diff(store=s, filedir_path=path) + except FileNotFoundError as e: + print(str(e), file=sys.stderr) + sys.exit(1) + diff --git a/src/nancy/fs.py b/src/nancy/fs.py index b87dcd9..c6a681d 100644 --- a/src/nancy/fs.py +++ b/src/nancy/fs.py @@ -61,32 +61,43 @@ def make_readonly_recursive(path, excluded=[]): remove_write_perms(os.path.join(path, p)) +@dataclass +class FSEntryVersion: + """A hashed file or directory.""" + unfrozen_perms: str # stat.filemode(os.stat(path).st_mode): '-rw-rw-r--' + symlink_target:str # if this is a symlink, this is the (read but not fully + # resolved) target. I.e. this is the "content" of the symlink. + sha256: str + id: int = None # should not be null + version_counter: int = None # incremented whenever a new version of the file is recorded + source_task_id: int = None + + @dataclass class FSEntry: """A hashed file or directory.""" - name: str # with parent directory stripped. None if this is the root + id: int # defaults to None + filename: str # with parent directory stripped. None if this is the root relpath: str # relative to some root directory # children for dirs only: non-recursive; files/dirs at this level only children: List['FSEntry'] filetype: str # regular, symlink, special (block, char, pipe, or socket) - perms: str # e.g. '-rw-rw-r--' - # For directories, sha256 is computed from concatenation of all children - # sorted alphabetically regardless of type using "{name}{hash}" format - sha256: bytes # sha256 of content for regular files, or of target for links + deleted: bool + versions: [FSEntryVersion] = [] + + # these will be filled from the version list automatically + unfrozen_perms: str = None # stat.filemode(os.stat(path).st_mode): '-rw-rw-r--' + symlink_target: str = None # if this is a symlink, this is the (read but not fully + # resolved) target. I.e. this is the "content" of the symlink. + sha256: str = None + latest_version: FSEntryVersion = None def __post_init__(self): - # compute sha256 if it's missing - if self.sha256 is None: - m = hashlib.sha256() - for d in self.subdirs: - assert d.sha256 is not None - m.update(bytes(d.dirname, 'utf-8')) - m.update(d.sha256) - for f in self.files: - assert f.sha256 is not None - m.update(bytes(f.filename, 'utf-8')) - m.update(f.sha256) - self.sha256 = m.digest() + if len(self.versions) > 0: + self.latest_version = self.versions[-1] + self.unfrozen_perms = self.latest_version.unfrozen_perms + self.symlink_target = self.latest_version.symlink_target + self.sha256 = self.latest_version.sha256 @classmethod def from_path(cls, root, relpath='.', filestat=None): @@ -101,11 +112,12 @@ class FSEntry: s = filestat.st_mode children = [] + symlink_target = None if stat.S_ISLNK(s): # Check links first, since it is not exclusive with dir or file checks filetype = 'LNK' - m.update(bytes(os.readlink(path), 'utf-8')) - print(path, "is link") + symlink_target = os.readlink(path) + m.update(bytes(symlink_target, 'utf-8')) elif stat.S_ISDIR(s): filetype = 'DIR' @@ -128,6 +140,11 @@ class FSEntry: for e in direntries ] for c in children: # now hash concatenated sorted hashes + # hash on perms+sha256 to enable recursively detecting perm + # changes without modifying the hashes of individual files, + # which remain content-based for compatibility with + # other tools + m.update(bytes(c.unfrozen_perms, 'utf-8')) m.update(c.sha256) elif stat.S_ISREG(s): filetype = 'REG' @@ -152,53 +169,106 @@ class FSEntry: sha256 = m.digest() return cls( - name=os.path.basename(relpath), + id=None, + filename=os.path.basename(relpath), relpath=relpath, children=children, filetype=filetype, - perms=stat.filemode(filestat.st_mode), - sha256=sha256, + deleted=False, + versions=[ + FSEntryVersion( + unfrozen_perms=stat.filemode(filestat.st_mode), + symlink_target=symlink_target, + sha256=sha256, + ) + ], ) @classmethod def empty_root(cls): """Just a standardized value indicating an empty root directory""" return cls( - name='.', + id=None, + filename='.', relpath='.', children=[], filetype='DIR', perms='----------', sha256=hashlib.sha256().digest(), + deleted=False, ) @classmethod def from_db_index(cls, cursor, root_id=None, root_row=None): """Given id of an entry in filedir, recursively fill this object""" - fields = '(id, filename, filetype, unfrozen_perms, frozen, sha256, symlink_target)' + fields = 'id, filename, filetype, frozen, deleted' if root_row is None: - cursor.execute(f'SELECT {fields} FROM filedir WHERE id={root_id};') + assert root_id is not None + cursor.execute(f'SELECT {fields} FROM filedir WHERE id=?', root_id) root_row = cursor.fetchone() - root_id, name, filetype, unfrozen_perms, frozen, sha256, symlink_target = root_row - # get children - cursor.execute(f'SELECT {fields} FROM filedir WHERE parent={root_id} ORDER BY name;') + root_id, filename, filetype, frozen, deleted = root_row + + cursor.execute(f'SELECT {fields} FROM filedir WHERE parent=? ORDER BY name', root_id) rows = cursor.fetchall() + children = [cls.from_db_index(cursor, root_row=r) for r in rows] + + fields = 'id, version_counter, unfrozen_perms, symlink_target, sha256, source_task_id' + cursor.execute(f'SELECT {fields} FROM filedir_version WHERE filedir=? ORDER BY version', self.id) + matches = cursor.fetchall() + versions = [FSEntryVersion(*row) for row in matches] + return cls( - name=name, + id=root_id, + filename=filename, relpath=relpath, - children=[cls.from_db_index(cursor, root_row=r) for r in rows], + children=children, filetype=filetype, perms=unfrozen_perms, sha256=bytes.fromhex(sha256), + deleted=deleted, + versions=versions, ) - def diff(self, other): - return FSDiff(self, other) + def versions(self, cursor): + """Get a list from the database of FSEntryVersion objects.""" + if self.id is None: + return None + + def flatten_tree(self, level=0): + """Return list of all entries, with level, in pairs""" + pairs = [(level, self)] + for c in sorted(self.children, key=lambda e: e.filename): + pairs.extend(c.flatten_tree(level=level + 1)) + return pairs +def sort_diffs_filename(diffs): + name_ent = {e.filename(): e for e in diffs} + return [name_ent[n] for n in sorted(name_ent.keys())] + + +@dataclass class FSDiff: - def __init__(self, A, B): - """Given two hashed directories, efficiently compute difference. + A: FSEntry # record the comparisons + B: FSEntry # a missing entry indicates new or deleted + modified_children: 'FSDiff' + + @staticmethod + def compare(A, B): + return A.sha256 == B.sha256 and \ + A.perms == B.perms and \ + A.filetype == B.filetype and \ + A.deleted == B.deleted + + def filename(self): + return (self.B.filename if self.A is None else self.A.filename) + + def filetype(self): + return (self.B.filetype if self.A is None else self.A.filetype) + + @classmethod + def compute(cls, A, B): + """Given two hashed directories, recursively compute difference. This assumes the hashes are consistent, so that directories with matching hashes need not be inspected deeply. @@ -208,5 +278,39 @@ class FSDiff: modified (Directory): overlay with modified entries from other new (Directory): overlay with new entries from other """ - self.A = A - self.B = B + if A is None: # new entry + return cls(A, B, [ + cls.compute(None, c) \ + for c in sorted(B.children, key=lambda e: e.filename) + ]) + if B is None: # deleted entry + return cls(A, B, [ + cls.compute(c, N) \ + for c in sorted(A.children, key=lambda e: e.filename) + ]) + + if cls.compare(A, B): + # no need to check descendents + return cls(A, B, []) + + # there are modifications at this level, not just insert/delete + + # list children and find overlaps + Alist = {c.filename: c for c in A.children} + Blist = {c.filename: c for c in B.children} + + allnames = set(Alist.keys() + Blist.keys()) + + modified_children = [cls.compute( + Alist.get(n, None), + Blist.get(n, None), + ) for n in allnames] + + return cls(A, B, modified_children) + + def flatten_tree(self, level=0): + """Return list of all entries, with level, in pairs""" + pairs = [(level, self)] + for c in sorted(self.modified_children, key=lambda d: d.filename()): + pairs.extend(c.flatten_tree(level=level + 1)) + return pairs diff --git a/src/nancy/schema/version0.sql b/src/nancy/schema/version0.sql index c777ab7..38d2e97 100644 --- a/src/nancy/schema/version0.sql +++ b/src/nancy/schema/version0.sql @@ -91,7 +91,7 @@ CREATE TABLE user(id INTEGER PRIMARY KEY NOT NULL, CREATE TABLE store (id INTEGER PRIMARY KEY NOT NULL, machine INTEGER, dbpath TEXT NOT NULL, - uuid TEXT NOT NULL, -- str(uuid.uuid4()) + uuid TEXT NOT NULL, -- UUID generated by str(uuid.uuid4()) imported BOOL, -- is this the store for the current directory? If not, then it was -- imported in order to describe a dependency. @@ -152,7 +152,7 @@ END; END; -- potentially a datum). CREATE TABLE filedir_version (id INTEGER PRIMARY KEY NOT NULL, filedir INTEGER REFERENCES filedir ON UPDATE CASCADE, -- parent filedir entry - version INTEGER NOT NULL, -- incremented whenever a new version of the file is recorded + version_counter INTEGER NOT NULL, -- incremented whenever a new version of the file is recorded -- We record the permissions on each file, in a way that enables reloading -- permissions properly when thawing after a freeze operation. diff --git a/src/nancy/store.py b/src/nancy/store.py index 4ac4247..1527a58 100644 --- a/src/nancy/store.py +++ b/src/nancy/store.py @@ -135,12 +135,49 @@ class Store: """Make store directory read-only (except for nancy.db) and return file list""" fs.make_readonly_recursive(self.path, excluded='./nancy.db') + def filedir_root_index(self, cur=None): + """Get the database id for the table entry in this store having name '.'""" + if cur is None: + cur = self.conn.cursor() + cur.execute('SELECT id FROM filedir WHERE store=0 AND parent is NULL;') + return cur.fetchone() + + def path_to_fsentry(self, path): + """Find a path in the filedir database and return it as an fsentry. + + If the path is not found in the store, None is returned. + """ + cur = self.conn.cursor() + + # get relative path to resolved path + rel = os.path.relpath(os.path.realpath(path), start=os.path.realpath(self.path)) + + # rel tells us how to descend recurively to find the filedir for path + fd_id = self.filedir_root_index(cur) + if fd_id is None: + # Root isn't even inserted into the db yet + return None + + for p in Path(rel).parts: # Path.parts splits a path reliably + # get child with that name + cur.execute( + 'SELECT id, filetype FROM filedir WHERE filename=? AND parent=? LIMIT 1', + (p, fd_id), + ) + row = cur.fetchone() + + if row is None: + return None + + fd_id, filetype = row + if filetype != 'DIR': + return fd_id + return fs.FSEntry.from_db_index(cur, root_id=fd_id) + + def fs_entries(self, shallow=False): """Return recursive structure containing FSEntry objects from db""" - # get the database id for the table entry in this store having name '.' - cur = self.conn.cursor() - cur.execute('SELECT id FROM filedir WHERE store=0 AND parent is NULL;') - root_id = cur.fetchone() + root_id = self.filedir_root_index() if root_id is None: return fs.FSEntry.empty_root() else: @@ -149,15 +186,26 @@ class Store: def program(self, name, message=None): return Program(self, name, message) + def diff(self, filepath): + """ + Diff a file or directory compared to its recorded version + """ + # get info about current files at this location + current = fs.FSEntry.from_path( + self.path, + # send a relative path from self.path to filepath, which is user + # provided + os.path.relpath(os.path.realpath(filepath), os.path.realpath(self.path)), + ) + + recorded = self.path_to_fsentry(filepath) + + return fs.FSDiff.compute(recorded, current) + def record(self, message=None): with self.program('RECORD', message) as p: - # get hashes of current directory (recursive) - current = fs.FSEntry.from_path(self.path) - # extract hashes for all previously-cataloged files/dirs - recorded = self.fs_entries(shallow=True) - - d = current.diff(recorded) + d = self.diff(self.path) # select all dirs @@ -217,7 +265,7 @@ def find_store(path): while True: if os.path.exists(os.path.join(d, "nancy.db")): return d - newd = os.path.dirname(path) + newd = os.path.dirname(d) if newd == d: # indicates this is a filesystem root like C:\ or / return None d = newd