Add working FSDiff and new diff command.

This uses colorama, so colorama is added to the Poetry dependencies. Next
I'll add the diff printing to the record command and have the corresponding
store method use the diff as input.
This commit is contained in:
Jacob Hinkle 2022-09-23 20:12:10 -04:00
parent bf141afa5f
commit 183e3ed613
7 changed files with 289 additions and 53 deletions

2
poetry.lock generated
View File

@ -263,7 +263,7 @@ testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-
[metadata] [metadata]
lock-version = "1.1" lock-version = "1.1"
python-versions = "^3.7" python-versions = "^3.7"
content-hash = "49ef6dea324178124adc9639e6329fff4411087ca60907b9661d138d6b6e88aa" content-hash = "c44b5e718866e498815a184650d98fd5e637b07246375d1e9fc4298a0125f0d6"
[metadata.files] [metadata.files]
attrs = [ attrs = [

View File

@ -7,6 +7,7 @@ authors = ["Jacob Hinkle <jacob.hinkle@jhink.org>"]
[tool.poetry.dependencies] [tool.poetry.dependencies]
python = "^3.7" python = "^3.7"
click = "^8.1.3" click = "^8.1.3"
colorama = "^0.4.5"
[tool.poetry.dev-dependencies] [tool.poetry.dev-dependencies]
black = "^22.8.0" black = "^22.8.0"

View File

@ -3,6 +3,7 @@ import click
from ..version import __version__ from ..version import __version__
#from .freeze import freeze, thaw #from .freeze import freeze, thaw
from . import diff
from . import record from . import record
@ -30,11 +31,9 @@ def version():
cls=AliasedGroup, cls=AliasedGroup,
help=f"Composable provenance tracking for scientific data") help=f"Composable provenance tracking for scientific data")
def main(): def main():
"""
"""
pass pass
#main.add_command(freeze) #main.add_command(freeze)
#main.add_command(thaw) #main.add_command(thaw)
main.add_command(diff.diff_cli, name='diff')
main.add_command(record.record_cli, name='record') main.add_command(record.record_cli, name='record')
main.add_command(version) main.add_command(version)

84
src/nancy/cli/diff.py Normal file
View File

@ -0,0 +1,84 @@
import click
from .. import db, fs, store
import os
import sys
import warnings
def print_diff(ABdiff: fs.FSDiff, version_info='count', indent=2,
               indent_level=0, use_color=True):
    """Write one line per entry of a flattened FSDiff to stdout.

    Each line has a change tag (NEW, DEL or MOD), the entry's
    ``unfrozen_perms`` and a name.

    Args:
        ABdiff: object whose ``flatten_tree()`` yields ``(level, diff)``
            pairs; each ``diff`` exposes ``A`` and ``B`` entries.
        version_info: accepted for interface compatibility; this function
            does not read it.
        indent: number of spaces per nesting level (DEL and MOD lines only).
        indent_level: extra nesting levels added to every DEL/MOD line.
        use_color: wrap the tags in colorama colors. If colorama cannot be
            imported a warning is issued and plain tags are used.
    """
    if use_color:
        try:
            from colorama import Fore, Back, Style
        except ImportError:
            warnings.warn("Could not import colorama library. Color output disabled.")
            use_color = False

    # Build the tag table once; the colorama names are only touched when the
    # import above succeeded.
    if use_color:
        palette = (('NEW', Fore.GREEN), ('DEL', Fore.RED), ('MOD', Fore.YELLOW))
        changetags = {tag: shade + tag + Style.RESET_ALL for tag, shade in palette}
    else:
        changetags = {tag: tag for tag in ('NEW', 'DEL', 'MOD')}

    for depth, node in ABdiff.flatten_tree():
        if node.A is None:
            # Only present on the B side: a new entry.
            assert node.B is not None
            tag, entry = 'NEW', node.B
            # New entries show relpath minus its first two characters
            # (presumably a leading "./" -- verify against FSEntry.from_path).
            label = entry.relpath[2:]
        else:
            if node.B is None:
                tag, entry = 'DEL', node.A
            else:
                tag, entry = 'MOD', node.B
            label = (indent_level + depth) * indent * ' ' + entry.filename
        print(changetags[tag], entry.unfrozen_perms, label)
def diff(store, filedir_path, version_info='count'):
    """Compute and print the diff for a path, without any click wrapping.

    Args:
        store: object providing ``diff(path)``, which returns the diff to print.
        filedir_path: file or directory to diff; must exist on disk.
        version_info: forwarded unchanged to ``print_diff``.

    Raises:
        FileNotFoundError: if ``filedir_path`` does not exist.
    """
    if os.path.exists(filedir_path):
        # Ask the store for the diff object and pretty-print it.
        print_diff(store.diff(filedir_path), version_info=version_info)
        return
    raise FileNotFoundError(f"Cannot diff non-existent file or directory {filedir_path}")
@click.command()
@click.argument("path", default='.')
@click.option(
    "--version-info",
    type=click.Choice(['count', 'details', 'none'], case_sensitive=False),
    default='count',
    help='How much info to display about file versions.',
)
def diff_cli(path, version_info):
    """Detect and describe changes to PATH

    PATH is a path to a file or directory inside an existing nancy store
    (a directory containing a file called "nancy.db"). If no "nancy.db" is
    detected in this directory it is searched for up the filesystem hierarchy.
    However, the printed diff information pertains to the given value of
    PATH.
    """
    # NOTE: the docstring above is the text click shows for --help, so
    # developer notes live in comments instead.
    storepath = store.find_store(path)
    if storepath is None:
        print("Could not find nancy.db in any directory containing "
              + str(os.path.realpath(path)), file=sys.stderr)
        sys.exit(1)

    print("Paths relative to store path:", storepath)

    # connect to store
    s = store.Store(storepath)

    try:
        # Forward --version-info; previously the option was parsed and then
        # dropped, so diff() always ran with its default.
        diff(store=s, filedir_path=path, version_info=version_info)
    except FileNotFoundError as e:
        # Report a missing PATH as a plain error message, not a traceback.
        print(str(e), file=sys.stderr)
        sys.exit(1)

View File

@ -61,32 +61,43 @@ def make_readonly_recursive(path, excluded=[]):
remove_write_perms(os.path.join(path, p)) remove_write_perms(os.path.join(path, p))
@dataclass
class FSEntryVersion:
    """One version record of a file or directory entry.

    Carries the per-version values (permissions, symlink target, hash) plus
    three bookkeeping fields that default to None when not supplied.
    """
    unfrozen_perms: str  # stat.filemode(os.stat(path).st_mode): '-rw-rw-r--'
    symlink_target:str  # if this is a symlink, this is the (read but not fully
                        # resolved) target. I.e. this is the "content" of the symlink.
    # NOTE(review): annotated as str, but FSEntry.from_path passes the raw
    # hashlib digest (bytes) here -- confirm the intended type.
    sha256: str
    # NOTE(review): "should not be null" yet the default is None -- presumably
    # filled in once the row exists in the database; confirm.
    id: int = None  # should not be null
    version_counter: int = None  # incremented whenever a new version of the file is recorded
    source_task_id: int = None
@dataclass @dataclass
class FSEntry: class FSEntry:
"""A hashed file or directory.""" """A hashed file or directory."""
name: str # with parent directory stripped. None if this is the root id: int # defaults to None
filename: str # with parent directory stripped. None if this is the root
relpath: str # relative to some root directory relpath: str # relative to some root directory
# children for dirs only: non-recursive; files/dirs at this level only # children for dirs only: non-recursive; files/dirs at this level only
children: List['FSEntry'] children: List['FSEntry']
filetype: str # regular, symlink, special (block, char, pipe, or socket) filetype: str # regular, symlink, special (block, char, pipe, or socket)
perms: str # e.g. '-rw-rw-r--' deleted: bool
# For directories, sha256 is computed from concatenation of all children versions: [FSEntryVersion] = []
# sorted alphabetically regardless of type using "{name}{hash}" format
sha256: bytes # sha256 of content for regular files, or of target for links # these will be filled from the version list automatically
unfrozen_perms: str = None # stat.filemode(os.stat(path).st_mode): '-rw-rw-r--'
symlink_target: str = None # if this is a symlink, this is the (read but not fully
# resolved) target. I.e. this is the "content" of the symlink.
sha256: str = None
latest_version: FSEntryVersion = None
def __post_init__(self): def __post_init__(self):
# compute sha256 if it's missing if len(self.versions) > 0:
if self.sha256 is None: self.latest_version = self.versions[-1]
m = hashlib.sha256() self.unfrozen_perms = self.latest_version.unfrozen_perms
for d in self.subdirs: self.symlink_target = self.latest_version.symlink_target
assert d.sha256 is not None self.sha256 = self.latest_version.sha256
m.update(bytes(d.dirname, 'utf-8'))
m.update(d.sha256)
for f in self.files:
assert f.sha256 is not None
m.update(bytes(f.filename, 'utf-8'))
m.update(f.sha256)
self.sha256 = m.digest()
@classmethod @classmethod
def from_path(cls, root, relpath='.', filestat=None): def from_path(cls, root, relpath='.', filestat=None):
@ -101,11 +112,12 @@ class FSEntry:
s = filestat.st_mode s = filestat.st_mode
children = [] children = []
symlink_target = None
if stat.S_ISLNK(s): if stat.S_ISLNK(s):
# Check links first, since it is not exclusive with dir or file checks # Check links first, since it is not exclusive with dir or file checks
filetype = 'LNK' filetype = 'LNK'
m.update(bytes(os.readlink(path), 'utf-8')) symlink_target = os.readlink(path)
print(path, "is link") m.update(bytes(symlink_target, 'utf-8'))
elif stat.S_ISDIR(s): elif stat.S_ISDIR(s):
filetype = 'DIR' filetype = 'DIR'
@ -128,6 +140,11 @@ class FSEntry:
for e in direntries for e in direntries
] ]
for c in children: # now hash concatenated sorted hashes for c in children: # now hash concatenated sorted hashes
# hash on perms+sha256 to enable recursively detecting perm
# changes without modifying the hashes of individual files,
# which remain content-based for compatibility with
# other tools
m.update(bytes(c.unfrozen_perms, 'utf-8'))
m.update(c.sha256) m.update(c.sha256)
elif stat.S_ISREG(s): elif stat.S_ISREG(s):
filetype = 'REG' filetype = 'REG'
@ -152,53 +169,106 @@ class FSEntry:
sha256 = m.digest() sha256 = m.digest()
return cls( return cls(
name=os.path.basename(relpath), id=None,
filename=os.path.basename(relpath),
relpath=relpath, relpath=relpath,
children=children, children=children,
filetype=filetype, filetype=filetype,
perms=stat.filemode(filestat.st_mode), deleted=False,
sha256=sha256, versions=[
FSEntryVersion(
unfrozen_perms=stat.filemode(filestat.st_mode),
symlink_target=symlink_target,
sha256=sha256,
)
],
) )
@classmethod @classmethod
def empty_root(cls): def empty_root(cls):
"""Just a standardized value indicating an empty root directory""" """Just a standardized value indicating an empty root directory"""
return cls( return cls(
name='.', id=None,
filename='.',
relpath='.', relpath='.',
children=[], children=[],
filetype='DIR', filetype='DIR',
perms='----------', perms='----------',
sha256=hashlib.sha256().digest(), sha256=hashlib.sha256().digest(),
deleted=False,
) )
@classmethod @classmethod
def from_db_index(cls, cursor, root_id=None, root_row=None): def from_db_index(cls, cursor, root_id=None, root_row=None):
"""Given id of an entry in filedir, recursively fill this object""" """Given id of an entry in filedir, recursively fill this object"""
fields = '(id, filename, filetype, unfrozen_perms, frozen, sha256, symlink_target)' fields = 'id, filename, filetype, frozen, deleted'
if root_row is None: if root_row is None:
cursor.execute(f'SELECT {fields} FROM filedir WHERE id={root_id};') assert root_id is not None
cursor.execute(f'SELECT {fields} FROM filedir WHERE id=?', root_id)
root_row = cursor.fetchone() root_row = cursor.fetchone()
root_id, name, filetype, unfrozen_perms, frozen, sha256, symlink_target = root_row root_id, filename, filetype, frozen, deleted = root_row
# get children
cursor.execute(f'SELECT {fields} FROM filedir WHERE parent={root_id} ORDER BY name;') cursor.execute(f'SELECT {fields} FROM filedir WHERE parent=? ORDER BY name', root_id)
rows = cursor.fetchall() rows = cursor.fetchall()
children = [cls.from_db_index(cursor, root_row=r) for r in rows]
fields = 'id, version_counter, unfrozen_perms, symlink_target, sha256, source_task_id'
cursor.execute(f'SELECT {fields} FROM filedir_version WHERE filedir=? ORDER BY version', self.id)
matches = cursor.fetchall()
versions = [FSEntryVersion(*row) for row in matches]
return cls( return cls(
name=name, id=root_id,
filename=filename,
relpath=relpath, relpath=relpath,
children=[cls.from_db_index(cursor, root_row=r) for r in rows], children=children,
filetype=filetype, filetype=filetype,
perms=unfrozen_perms, perms=unfrozen_perms,
sha256=bytes.fromhex(sha256), sha256=bytes.fromhex(sha256),
deleted=deleted,
versions=versions,
) )
def diff(self, other): def versions(self, cursor):
return FSDiff(self, other) """Get a list from the database of FSEntryVersion objects."""
if self.id is None:
return None
def flatten_tree(self, level=0):
    """Flatten this entry and its descendants into ``(level, entry)`` pairs.

    The entry itself comes first at ``level``; its children follow in
    ``filename`` order, each subtree flattened one level deeper.
    """
    flat = [(level, self)]
    ordered = list(self.children)
    ordered.sort(key=lambda entry: entry.filename)
    for child in ordered:
        flat += child.flatten_tree(level=level + 1)
    return flat
def sort_diffs_filename(diffs):
    """Return the given diffs ordered by the value of their ``filename()``.

    Entries are keyed by name first, so when several share a name only the
    last one seen is kept.
    """
    by_name = {}
    for item in diffs:
        by_name[item.filename()] = item
    return [by_name[key] for key in sorted(by_name)]
@dataclass
class FSDiff: class FSDiff:
def __init__(self, A, B): A: FSEntry # record the comparisons
"""Given two hashed directories, efficiently compute difference. B: FSEntry # a missing entry indicates new or deleted
modified_children: 'FSDiff'
@staticmethod
def compare(A, B):
    """Return True when entries A and B are considered unchanged.

    Two entries match when their content hash, permission string, file type
    and deleted flag are all equal.

    Args:
        A: entry exposing ``sha256``, ``unfrozen_perms``, ``filetype`` and
            ``deleted``.
        B: entry with the same attributes.

    Returns:
        bool: True if every compared attribute is equal.
    """
    # FSEntry stores permissions as ``unfrozen_perms``; it has no ``perms``
    # attribute, so the old lookup raised AttributeError.
    return (
        A.sha256 == B.sha256
        and A.unfrozen_perms == B.unfrozen_perms
        and A.filetype == B.filetype
        and A.deleted == B.deleted
    )
def filename(self):
    """Return the filename of the A entry, or of B when A is None."""
    source = self.B if self.A is None else self.A
    return source.filename
def filetype(self):
    """Return the filetype of the A entry, or of B when A is None."""
    source = self.B if self.A is None else self.A
    return source.filetype
@classmethod
def compute(cls, A, B):
"""Given two hashed directories, recursively compute difference.
This assumes the hashes are consistent, so that directories with This assumes the hashes are consistent, so that directories with
matching hashes need not be inspected deeply. matching hashes need not be inspected deeply.
@ -208,5 +278,39 @@ class FSDiff:
modified (Directory): overlay with modified entries from other modified (Directory): overlay with modified entries from other
new (Directory): overlay with new entries from other new (Directory): overlay with new entries from other
""" """
self.A = A if A is None: # new entry
self.B = B return cls(A, B, [
cls.compute(None, c) \
for c in sorted(B.children, key=lambda e: e.filename)
])
if B is None: # deleted entry
return cls(A, B, [
cls.compute(c, N) \
for c in sorted(A.children, key=lambda e: e.filename)
])
if cls.compare(A, B):
# no need to check descendents
return cls(A, B, [])
# there are modifications at this level, not just insert/delete
# list children and find overlaps
Alist = {c.filename: c for c in A.children}
Blist = {c.filename: c for c in B.children}
allnames = set(Alist.keys() + Blist.keys())
modified_children = [cls.compute(
Alist.get(n, None),
Blist.get(n, None),
) for n in allnames]
return cls(A, B, modified_children)
def flatten_tree(self, level=0):
    """Flatten this diff and its modified children into ``(level, diff)`` pairs.

    This diff comes first at ``level``; each child in ``modified_children``
    follows in ``filename()`` order, flattened one level deeper.
    """
    flat = [(level, self)]
    ordered = list(self.modified_children)
    ordered.sort(key=lambda node: node.filename())
    for child in ordered:
        flat += child.flatten_tree(level=level + 1)
    return flat

View File

@ -91,7 +91,7 @@ CREATE TABLE user(id INTEGER PRIMARY KEY NOT NULL,
CREATE TABLE store (id INTEGER PRIMARY KEY NOT NULL, CREATE TABLE store (id INTEGER PRIMARY KEY NOT NULL,
machine INTEGER, machine INTEGER,
dbpath TEXT NOT NULL, dbpath TEXT NOT NULL,
uuid TEXT NOT NULL, -- str(uuid.uuid4()) uuid TEXT NOT NULL, -- UUID generated by str(uuid.uuid4())
imported BOOL, -- is this the store for the current directory? If not, then it was imported BOOL, -- is this the store for the current directory? If not, then it was
-- imported in order to describe a dependency. -- imported in order to describe a dependency.
@ -152,7 +152,7 @@ END; END;
-- potentially a datum). -- potentially a datum).
CREATE TABLE filedir_version (id INTEGER PRIMARY KEY NOT NULL, CREATE TABLE filedir_version (id INTEGER PRIMARY KEY NOT NULL,
filedir INTEGER REFERENCES filedir ON UPDATE CASCADE, -- parent filedir entry filedir INTEGER REFERENCES filedir ON UPDATE CASCADE, -- parent filedir entry
version INTEGER NOT NULL, -- incremented whenever a new version of the file is recorded version_counter INTEGER NOT NULL, -- incremented whenever a new version of the file is recorded
-- We record the permissions on each file, in a way that enables reloading -- We record the permissions on each file, in a way that enables reloading
-- permissions properly when thawing after a freeze operation. -- permissions properly when thawing after a freeze operation.

View File

@ -135,12 +135,49 @@ class Store:
"""Make store directory read-only (except for nancy.db) and return file list""" """Make store directory read-only (except for nancy.db) and return file list"""
fs.make_readonly_recursive(self.path, excluded='./nancy.db') fs.make_readonly_recursive(self.path, excluded='./nancy.db')
def filedir_root_index(self, cur=None):
    """Look up the root filedir entry of this store.

    Selects ``id`` from ``filedir`` where ``store=0`` and ``parent`` is NULL.

    Args:
        cur: cursor to run the query on; a new one is opened on
            ``self.conn`` when omitted.

    Returns:
        The row exactly as ``fetchone()`` gives it (a one-column row holding
        the id), or None when no such row exists.
    """
    cursor = self.conn.cursor() if cur is None else cur
    cursor.execute('SELECT id FROM filedir WHERE store=0 AND parent is NULL;')
    root_row = cursor.fetchone()
    return root_row
def path_to_fsentry(self, path):
    """Find a path in the filedir database and return it as an fsentry.

    The path is resolved and made relative to the store directory, then
    followed one component at a time through the ``filedir`` table.

    Args:
        path: file or directory path to look up.

    Returns:
        The result of ``fs.FSEntry.from_db_index`` for the matching row, or
        None if the store root is not recorded, a component is missing, or a
        non-directory entry appears before the last component.
    """
    cur = self.conn.cursor()

    # get relative path to resolved path
    rel = os.path.relpath(os.path.realpath(path), start=os.path.realpath(self.path))

    # rel tells us how to descend recursively to find the filedir for path
    root_row = self.filedir_root_index(cur)
    if root_row is None:
        # Root isn't even inserted into the db yet
        return None
    # filedir_root_index returns the fetched row, not a bare id. Binding the
    # row itself as a query parameter fails, so take the id out of it.
    fd_id = root_row[0]

    parts = Path(rel).parts  # Path.parts splits a path reliably; () for '.'
    last = len(parts) - 1
    for depth, part in enumerate(parts):
        # get child with that name
        cur.execute(
            'SELECT id, filetype FROM filedir WHERE filename=? AND parent=? LIMIT 1',
            (part, fd_id),
        )
        row = cur.fetchone()
        if row is None:
            return None
        fd_id, filetype = row
        if filetype != 'DIR' and depth < last:
            # Cannot descend below a non-directory, so the path is not
            # recorded. Previously the raw id was returned here.
            return None

    return fs.FSEntry.from_db_index(cur, root_id=fd_id)
def fs_entries(self, shallow=False): def fs_entries(self, shallow=False):
"""Return recursive structure containing FSEntry objects from db""" """Return recursive structure containing FSEntry objects from db"""
# get the database id for the table entry in this store having name '.' root_id = self.filedir_root_index()
cur = self.conn.cursor()
cur.execute('SELECT id FROM filedir WHERE store=0 AND parent is NULL;')
root_id = cur.fetchone()
if root_id is None: if root_id is None:
return fs.FSEntry.empty_root() return fs.FSEntry.empty_root()
else: else:
@ -149,15 +186,26 @@ class Store:
def program(self, name, message=None): def program(self, name, message=None):
return Program(self, name, message) return Program(self, name, message)
def diff(self, filepath):
    """Diff a file or directory against its recorded version.

    Args:
        filepath: user-provided path to the file or directory to compare.

    Returns:
        The result of ``fs.FSDiff.compute(recorded, current)``, where
        ``current`` is read from disk and ``recorded`` comes from the
        database via ``path_to_fsentry``.
    """
    # filepath is user provided, so express it relative to the store
    # directory after resolving both sides.
    target = os.path.realpath(filepath)
    store_root = os.path.realpath(self.path)
    relative = os.path.relpath(target, store_root)

    # get info about current files at this location
    current = fs.FSEntry.from_path(self.path, relative)

    recorded = self.path_to_fsentry(filepath)

    return fs.FSDiff.compute(recorded, current)
def record(self, message=None): def record(self, message=None):
with self.program('RECORD', message) as p: with self.program('RECORD', message) as p:
# get hashes of current directory (recursive)
current = fs.FSEntry.from_path(self.path)
# extract hashes for all previously-cataloged files/dirs d = self.diff(self.path)
recorded = self.fs_entries(shallow=True)
d = current.diff(recorded)
# select all dirs # select all dirs
@ -217,7 +265,7 @@ def find_store(path):
while True: while True:
if os.path.exists(os.path.join(d, "nancy.db")): if os.path.exists(os.path.join(d, "nancy.db")):
return d return d
newd = os.path.dirname(path) newd = os.path.dirname(d)
if newd == d: # indicates this is a filesystem root like C:\ or / if newd == d: # indicates this is a filesystem root like C:\ or /
return None return None
d = newd d = newd