Add more Store functionality

This commit is contained in:
Jacob Hinkle 2022-09-20 12:09:09 -04:00
parent 3a99a14a0a
commit c30ca6a083
3 changed files with 238 additions and 15 deletions

View File

@ -19,10 +19,20 @@ class AliasedGroup(click.Group):
ctx.fail("Too many matches: %s" % ", ".join(sorted(matches)))
@click.group(f"nancy v{__version__}", cls=AliasedGroup)
@click.command()
def version():
    """Print version information."""
    # Build the banner first so the output format lives in one place.
    banner = f"nancy v{__version__}"
    print(banner)
@click.group(
    help="Composable provenance tracking for scientific data",
    cls=AliasedGroup,
)
def main():
    """Root command group; sub-commands are attached with ``add_command``."""
    # The user-facing help text is supplied via ``help=`` above, so this
    # docstring is for maintainers only.
# Attach every sub-command to the root group, in the original order.
for _subcommand in (freeze, thaw, version):
    main.add_command(_subcommand)

View File

@ -2,27 +2,43 @@ import click
from .. import store
import os
@click.command()
@click.argument("directory")
def freeze(directory):
    """
    Initialize tracking in a directory or freeze a tracked directory.

    If DIRECTORY is not already part of an existing nancy store, then a new
    'nancy.db' file is created in that directory. On the other hand, if the
    directory is part of an existing store, it will be re-frozen and versions
    of any files changed since thawing will be incremented.
    """
    if not os.path.isdir(directory):
        raise ValueError(f"Cannot freeze non-existent directory {directory}")

    # Only the current Store API is used here: the superseded calls
    # (db.find_store, store.init, store.connect) referenced names that are
    # not imported or whose results were discarded.
    existing_store = store.find_store(directory)
    if existing_store is None:  # this is a new store
        s = store.Store.init(directory)
    else:  # this is an existing store
        # NOTE(review): find_store may locate a store in a parent directory;
        # confirm whether Store should be opened on `existing_store` instead
        # of `directory` in that case.
        s = store.Store(directory)

    # Store.freeze() strips write permissions and (eventually) records
    # checksums/versions in nancy.db.
    s.freeze()
@click.command()
@click.argument("files", nargs=-1)  # , help="Files or directories to thaw.")
def thaw(files):
    """
    Enable manual alteration of files within a tracked directory.

    This command is meant to be used in conjunction with the 'freeze'
    subcommand. After thawing, changes may be made in the current directory,
    after which `nancy freeze` should be run. At that point, changes will be
    recorded: new files will be detected and modified files will have their
    version numbers incremented.
    """
    # TODO: not implemented yet; FILES is accepted but currently ignored.
    pass

View File

@ -1,6 +1,206 @@
"""Utilities for creating new stores and linking between them."""
from . import db
import importlib
import json
import os
from pathlib import Path
import sqlite3
import stat
def remove_write_perms(path):
    """Remove write permissions for all users while preserving other perms.

    Arguments:
        path: File system path whose mode should be changed. Symbolic links
            are never followed: the link itself is inspected, and the link
            target is left untouched.

    Returns:
        str: The permission string (as produced by ``stat.filemode``, e.g.
        ``'-rw-r--r--'``) describing the mode *before* the change.
    """
    # lstat so that the reported/changed mode refers to `path` itself rather
    # than to whatever a symlink happens to point at.
    st = os.lstat(path)
    orig_perm_string = stat.filemode(st.st_mode)

    # Clear (rather than toggle) the write bits: XOR would *grant* write
    # permission to any class that did not already have it.
    write_bits = stat.S_IWUSR | stat.S_IWGRP | stat.S_IWOTH
    new_mode = stat.S_IMODE(st.st_mode) & ~write_bits

    if not stat.S_ISLNK(st.st_mode):
        os.chmod(path, new_mode)
    elif os.chmod in os.supports_follow_symlinks:
        # Only some platforms can change the mode of a link itself.
        os.chmod(path, new_mode, follow_symlinks=False)
    # Otherwise: link modes cannot be changed on this platform and following
    # the link could modify a file outside the store, so leave it alone.

    return orig_perm_string
class Store:
    """Describes a data directory, holds active connection to nancy.db.

    Instance attributes (set in ``__init__``):
        path: ``pathlib.Path`` of the store directory, or None for an
            in-memory store.
        db_path: Path of the SQLite database file, or ``':memory:'``.
        conn: Open ``sqlite3.Connection`` to the database.
    """

    def __init__(self, directory=None, conn=None):
        """
        Arguments:
            directory (str): Location of existing store directory. If omitted
                or None, initialize a store in memory, with no associated
                directory.
            conn (sqlite3.Connection): Already-open connection to reuse. If
                omitted or None, a new connection is opened via ``connect()``.
        """
        if directory is None:
            self.path = None
            self.db_path = ':memory:'
        else:
            self.path = Path(directory)
            self.db_path = self.path / "nancy.db"
        if conn is None:
            self.connect()
        else:
            self.conn = conn

    def copy(self, store_path):
        """Copy this store's database into ``store_path`` and open it.

        Arguments:
            store_path: Existing directory that will receive 'nancy.db'.

        Returns:
            A new instance of this class connected to the copied database.
        """
        dst_db_path = os.path.join(store_path, "nancy.db")
        dst_conn = sqlite3.connect(dst_db_path)
        try:
            self.conn.backup(dst_conn)
        finally:
            # Release the temporary handle even if the backup fails; the
            # returned store opens its own connection.
            dst_conn.close()
        return self.__class__(store_path)

    def connect(self):
        """Open a connection to ``self.db_path`` and enable foreign keys.

        Returns:
            The new ``sqlite3.Connection`` (also stored as ``self.conn``).
        """
        self.conn = sqlite3.connect(self.db_path)
        self.conn.cursor().execute("PRAGMA foreign_keys = ON;")
        return self.conn

    @classmethod
    def init(cls, directory=None):
        """Create a new store and return it.

        Arguments:
            directory: Existing directory in which to create 'nancy.db'. If
                omitted or None, the store is created in memory.

        Raises:
            FileNotFoundError: ``directory`` is not an existing directory.
            FileExistsError: ``directory`` already contains 'nancy.db'.
        """
        if directory is None:  # initialize an in-memory store
            db_path = ':memory:'
        else:
            if not os.path.isdir(directory):
                raise FileNotFoundError(
                    f"Directory {directory} must exist before initializing a store there.",
                )
            db_path = os.path.join(directory, 'nancy.db')
            if os.path.isfile(db_path):
                raise FileExistsError(
                    f"File {db_path} exists. Refusing to re-initialize",
                )

        # `import importlib` alone does not guarantee that the `resources`
        # submodule is loaded, so import it explicitly here.
        import importlib.resources

        # Read the schema before touching the database so that a missing
        # schema does not leave an empty nancy.db behind.
        with importlib.resources.open_text("nancy", "schema.sql") as schema_file:
            schema = schema_file.read()

        # initialize a database in the target directory
        conn = sqlite3.connect(db_path)
        # Same per-connection setting that connect() applies.
        conn.cursor().execute("PRAGMA foreign_keys = ON;")
        conn.cursor().executescript(schema)

        new_store = cls(directory, conn)
        new_store.record_machine_description()

        return new_store

    def record_machine_description(self):
        """Record machine-specific information in the ``machine`` table."""
        import platform
        import time

        cur = self.conn.cursor()

        try:
            fdor = json.dumps(platform.freedesktop_os_release())
        except (AttributeError, OSError):
            # AttributeError: freedesktop_os_release only exists for
            # python >= 3.10. OSError: no os-release file on this machine
            # (e.g. non-Linux platforms).
            fdor = ''

        machine_info = (
            platform.node(),
            platform.processor(),
            platform.system(),
            platform.release(),
            platform.machine(),
            time.tzname[time.daylight],
            fdor,
            json.dumps(platform.win32_ver()),
            json.dumps(platform.mac_ver()),
        )
        cur.execute('INSERT INTO machine VALUES(NULL,?,?,?,?,?,?,?,?,?)', machine_info)
        self.conn.commit()

    def make_readonly(self):
        """Make store directory read-only (except for nancy.db) and return file list.

        Returns:
            tuple: ``(dirpaths, regfilepaths, symlinkpaths, irregfilepaths)``.
            Each is a dict mapping a path (as produced by joining the
            ``os.walk`` root with the entry name) to the value returned by
            ``remove_write_perms`` for that path. Every entry appears in
            exactly one of the four dicts.
        """
        dirpaths = {}
        regfilepaths = {}
        symlinkpaths = {}
        irregfilepaths = {}  # any file that's not regular or symlink. e.g. inodes or sockets

        # os.walk reports the top-level root exactly as it was given, so this
        # join yields the same string as the walk does for the database file,
        # whatever the store path is.
        db_path = os.path.join(self.path, 'nancy.db')

        # traversing bottom-up makes it easier to freeze perms on directories
        for root, dirs, files in os.walk(self.path, topdown=False):
            for f in files:
                # `root` already starts with the store path; do not join the
                # store path a second time.
                p = os.path.join(root, f)
                if p == db_path:
                    continue
                orig_perms = remove_write_perms(p)
                # Test for links first: isfile() follows links, so a link to
                # a non-file would otherwise be reported as irregular.
                if os.path.islink(p):
                    symlinkpaths[p] = orig_perms
                elif os.path.isfile(p):
                    regfilepaths[p] = orig_perms
                else:
                    # not a link or regular file
                    irregfilepaths[p] = orig_perms
            for d in dirs:
                p = os.path.join(root, d)
                orig_perms = remove_write_perms(p)
                # os.walk lists links to directories among `dirs`; report
                # them as links so their targets get checked by freeze().
                if os.path.islink(p):
                    symlinkpaths[p] = orig_perms
                else:
                    dirpaths[p] = orig_perms

        return dirpaths, regfilepaths, symlinkpaths, irregfilepaths

    def file_hashes(self, local_only=True):
        """Get recorded hashes of all files currently tracked in this store"""
        # TODO: not implemented yet; currently returns None and ignores
        # `local_only`.

    @staticmethod
    def _is_inside(child, parent):
        """Return True if absolute path ``child`` is ``parent`` or lies below it."""
        try:
            return os.path.commonpath([child, parent]) == parent
        except ValueError:
            # Raised e.g. for paths on different drives: cannot be nested.
            return False

    def freeze(self):
        """Strip write permissions from the store and validate its contents.

        Raises:
            Exception: The directory contains irregular files, or a symbolic
                link resolves to a location outside the store directory.
        """
        # Remove write permissions on all files except nancy.db
        # This also catalogs all files, directories, and symlinks
        # Note that ./nancy.db is NOT listed in files and is not altered
        _dirs, _files, symlinks, irreg = self.make_readonly()

        if irreg:
            # Explicit `+`: adjacent string literals would otherwise merge
            # into the separator passed to join().
            raise Exception(
                "Refusing to freeze directory containing irregular files: "
                + ", ".join(irreg),
            )

        # Resolve the store root the same way link targets are resolved so
        # that both sides of the comparison are canonical.
        store_root = os.path.realpath(self.path)
        for p in symlinks:
            target = os.readlink(p)  # gives possibly relative path to tgt
            resolved = os.path.realpath(p)
            # Component-wise check: a plain prefix test would accept sibling
            # directories such as '/data/store2' for store '/data/store'.
            if not self._is_inside(resolved, store_root):
                raise Exception(
                    f"Freezing {p} failed since it points to location "
                    f"{target} which is outside store path ({self.path})",
                )

        # TODO: compute checksums on all files
        # TODO: detect new and deleted files, and those that have changed type
        # TODO: update versions in nancy.db as appropriate

        # remove write permissions on nancy.db
        remove_write_perms(os.path.join(self.path, 'nancy.db'))

    # TODO: intended to become a context manager (see docstring); not
    # implemented yet.
    def run(
        self,
        name=None,
        message=None,
    ):
        """
        Create a context manager that encapsulates a procedure that can save files.

        Note that this does NOT spawn any new OS processes or threads.

        Example:
            s = nancy.store.Store.init(target_directory)
            with s.run("sum_dataframe") as f:
                x = PandasDataframe()
                y = Sum(x)
                f.save('stats/xsum.csv', y)
        """
        pass
class StoreFile:
    """Describes a file that is recorded in the store."""

    def __init__(self, store, rel_path):
        """Remember the owning store and the file's path.

        Arguments:
            store: The store this file belongs to.
            rel_path: Path of the file; presumably relative to the store
                directory (TODO confirm against callers).
        """
        self.store, self.rel_path = store, rel_path

    def save(self):
        """Placeholder: will call the appropriate save method; returns None."""
        return None
def find_store(path):
@ -14,7 +214,4 @@ def find_store(path):
newd = os.path.dirname(path)
if newd == d: # indicates this is a filesystem root like C:\ or /
return None
def connect(path):
"""Given directory, connect to its store"""
d = newd