diff --git a/Cargo.lock b/Cargo.lock index 989913f..02d7e05 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,17 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "ahash" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fcb51a0695d8f838b1ee009b3fbf66bda078cd64590202a864a8f3e8c4315c47" +dependencies = [ + "getrandom", + "once_cell", + "version_check", +] + [[package]] name = "aho-corasick" version = "0.7.19" @@ -29,10 +40,13 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] -name = "cc" -version = "1.0.73" +name = "block-buffer" +version = "0.10.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fff2a6927b3bb87f9595d67196a70493f627687a71d87a0d692242c33f58c11" +checksum = "69cce20737498f97b993470a6e536b8523f0af7892a4f928cceb1ac5e52ebe7e" +dependencies = [ + "generic-array", +] [[package]] name = "cfg-if" @@ -77,6 +91,54 @@ dependencies = [ "os_str_bytes", ] +[[package]] +name = "convert_case" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e" + +[[package]] +name = "cpufeatures" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28d997bd5e24a5928dd43e46dc529867e207907fe0b239c3477d924f7f2ca320" +dependencies = [ + "libc", +] + +[[package]] +name = "crypto-common" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" +dependencies = [ + "generic-array", + "typenum", +] + +[[package]] +name = "derive_more" +version = "0.99.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fb810d30a7c1953f91334de7244731fc3f3c10d7fe163338a35b9f640960321" +dependencies = [ + "convert_case", + "proc-macro2", + "quote", + "rustc_version", + "syn", +] + +[[package]] +name = "digest" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adfbc57365a37acbd2ebf2b64d7e69bb766e2fea813521ed536f5d0520dcf86c" +dependencies = [ + "block-buffer", + "crypto-common", +] + [[package]] name = "env_logger" version = "0.9.1" @@ -90,6 +152,57 @@ dependencies = [ "termcolor", ] +[[package]] +name = "fallible-iterator" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4443176a9f2c162692bd3d352d745ef9413eec5782a80d8fd6f8a1ac692a07f7" + +[[package]] +name = "fallible-streaming-iterator" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a" + +[[package]] +name = "generic-array" +version = "0.14.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bff49e947297f3312447abdca79f45f4738097cc82b06e72054d2223f601f1b9" +dependencies = [ + "typenum", + "version_check", +] + +[[package]] +name = "getrandom" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4eb1a864a501629691edf6c15a593b7a51eebaa1e8468e9ddc623de7c9b58ec6" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "hashbrown" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" +dependencies = [ + "ahash", +] + +[[package]] +name = "hashlink" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69fe1fcf8b4278d860ad0548329f892a3631fb63f82574df68275f34cdbe0ffa" +dependencies = [ + "hashbrown", +] + [[package]] name = "heck" version = "0.4.0" @@ -117,6 +230,16 @@ version = "0.2.135" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "68783febc7782c6c5cb401fbda4de5a9898be1762314da0bb2c10ced61f18b0c" +[[package]] +name = "libsqlite3-sys" +version = "0.25.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f0455f2c1bc9a7caa792907026e469c1d91761fb0ea37cbb16427c77280cf35" +dependencies = [ + "pkg-config", + "vcpkg", +] + [[package]] name = "log" version = "0.4.17" @@ -137,9 +260,14 @@ name = "nancy" version = "0.1.0" dependencies = [ "clap", + "derive_more", "env_logger", "log", - "sqlite", + "once_cell", + "rusqlite", + "rusqlite_migration", + "sha2", + "uuid", ] [[package]] @@ -220,35 +348,62 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a3f87b73ce11b1619a3c6332f45341e0047173771e8b8b73f87bfeefb7b56244" [[package]] -name = "sqlite" -version = "0.27.0" +name = "rusqlite" +version = "0.28.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2df8edd55685048550daaaf2be9024182f3523086cc86f7d50c136e55173e8c" +checksum = "01e213bc3ecb39ac32e81e51ebe31fd888a940515173e3a18a35f8c6e896422a" dependencies = [ - "libc", - "sqlite3-sys", + "bitflags", + "fallible-iterator", + "fallible-streaming-iterator", + "hashlink", + "libsqlite3-sys", + "smallvec", + "uuid", ] [[package]] -name = "sqlite3-src" +name = "rusqlite_migration" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eda44233be97aea786691f9f6f7ef230bcf905061f4012e90f4f39e6dcf31163" +dependencies = [ + "log", + "rusqlite", +] + +[[package]] +name = "rustc_version" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d1815a7a02c996eb8e5c64f61fcb6fd9b12e593ce265c512c5853b2513635691" +checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" dependencies = [ - "cc", - "pkg-config", + "semver", ] [[package]] -name = "sqlite3-sys" -version = "0.14.0" +name = "semver" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d47c99824fc55360ba00caf28de0b8a0458369b832e016a64c13af0ad9fbb9ee" +checksum = "e25dfac463d778e353db5be2449d1cce89bd6fd23c9f1ea21310ce6e5a1b29c4" + +[[package]] +name = "sha2" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "82e6b795fe2e3b1e845bafcb27aa35405c4d47cdfc92af5fc8d3002f76cebdc0" dependencies = [ - "libc", - "sqlite3-src", + "cfg-if", + "cpufeatures", + "digest", ] +[[package]] +name = "smallvec" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0" + [[package]] name = "strsim" version = "0.10.0" @@ -275,18 +430,45 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "typenum" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcf81ac59edc17cc8697ff311e8f5ef2d99fcbd9817b34cec66f90b6c3dfd987" + [[package]] name = "unicode-ident" version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ceab39d59e4c9499d4e5a8ee0e2735b891bb7308ac83dfb4e80cad195c9f6f3" +[[package]] +name = "uuid" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "feb41e78f93363bb2df8b0e86a2ca30eed7806ea16ea0c790d757cf93f79be83" +dependencies = [ + "getrandom", +] + +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + [[package]] name = "version_check" version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + [[package]] name = "winapi" version = "0.3.9" diff --git a/Cargo.toml b/Cargo.toml index 8ac207a..49bc2a0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,14 +11,19 @@ readme = "README.md" [lib] name = "nancy" -path = "src/lib/main.rs" +path = "src/lib.rs" [[bin]] name = "nancy" -path = "src/cli/main.rs" +path = "src/main.rs" [dependencies] clap = { version = "4.0.14", features = ["derive"] } +derive_more = "0.99.17" env_logger = "0.9.1" log = "0.4.17" -sqlite = "0.27.0" +once_cell = "1.15.0" +rusqlite = { version = "0.28.0", features = ["uuid"] } +rusqlite_migration = "1.0.0" +sha2 = "0.10.6" +uuid = { version = "1.2.1", features = ["v4"] } diff --git a/flake.nix b/flake.nix index 7a4d0df..142a869 100644 --- a/flake.nix +++ b/flake.nix @@ -33,7 +33,12 @@ #defaultPackage = naersk-lib.buildPackage ./.; devShell = with pkgs; mkShell { inputsFrom = builtins.attrValues self.packages.${system}; - buildInputs = [ cargo rust-analyzer clippy rustfmt ]; + buildInputs = [ + cargo + rust-analyzer clippy rustfmt # linting + sqlite + openssl pkgconfig # for openssl-sys dep that gets pulled in + ]; }; }); } diff --git a/src/cli/main.rs b/src/cli/main.rs deleted file mode 100644 index 5ae6743..0000000 --- a/src/cli/main.rs +++ /dev/null @@ -1,45 +0,0 @@ -use clap::{Parser, Subcommand}; -use env_logger; -use nancy; - -// Composable provenance tracking for scientific data analysis -#[derive(Parser)] -#[command(author, version, about, long_about = None, arg_required_else_help=true)] -struct Cli { - #[command(subcommand)] - command: Option, -} - -#[derive(Subcommand)] -enum Commands { - /// Record changes to a store directory (or a new store) - Record {}, - /// Check for changes in store and print basic statistics - Status {}, - /// Just say hello - Hello { - #[arg(short, long)] - fromlib: bool, - }, -} - -fn main() { - env_logger::init(); - let cli = Cli::parse(); - - match &cli.command { - Some(Commands::Hello { fromlib }) => { - println!("Hello from nancy (binary)!"); - if *fromlib { - nancy::say_hello(); - } - } - Some(Commands::Record {}) => { - println!("Record not yet implemented"); - } - Some(Commands::Status {}) => { - println!("Status not yet implemented"); - } - None => {} - } -} diff --git a/src/db.rs b/src/db.rs new file mode 100644 index 0000000..cf7b518 --- /dev/null +++ b/src/db.rs @@ -0,0 +1,104 @@ +//! # Database housekeeping methods +//! +//! This module is mostly for managing migrations. This does not encapsulate all database accesses. +//! + +extern crate derive_more; +use derive_more::{Display,Error,From}; + +use once_cell::sync::Lazy; +use rusqlite; +use rusqlite::Connection; +use rusqlite_migration::{Error as RMError, Migrations, M, SchemaVersion}; +use uuid; + +use std::num::{NonZeroUsize}; // for describing schema versions + +// NOTE: apply pragmas outside of migrations, using e.g. +// conn.pragma_update(None, "foreign_keys", &"ON").unwrap(); +static MIGRATIONS: Lazy> = Lazy::new(|| + Migrations::new( + vec![ + M::up(include_str!("migrations/20221024_initial_schema.sql")), + ] +)); +/// The current schema version as it will appear in the SQLite user_version +pub static CURRENT_SCHEMA_VERSION: usize = 1; + +/// Error type for checking schema version of a Connection +#[derive(Debug,Display)] +pub enum SchemaError { + CurrentVersionError(RMError), + NoSchemaVersionSet, + OldSchema(NonZeroUsize), + OutsideSchema(NonZeroUsize), +} +impl From for SchemaError { + fn from(cve: RMError) -> Self { + Self::CurrentVersionError(cve) + } +} + +/// Get schema version in connected database +pub fn schema_version(conn: &Connection) -> Result { + match MIGRATIONS.current_version(conn)? { + SchemaVersion::NoneSet => Err(SchemaError::NoSchemaVersionSet), + SchemaVersion::Inside(v) => // A known version is set. Check if it is current + if v.get() == CURRENT_SCHEMA_VERSION { + Ok(v) + } else { + Err(SchemaError::OldSchema(v)) + }, + SchemaVersion::Outside(v) => Err(SchemaError::OutsideSchema(v)), + } +} + + +/// Initialize the database starting with an empty schema. +/// +/// This function initializes a database to the latest schema, and also generates a new random UUID +/// and associates that with this dataset. The generated uuid is returned. +pub fn init(conn: &mut Connection) -> Result { + MIGRATIONS.to_latest(conn)?; + let u = uuid::Uuid::new_v4(); + conn.execute( + "INSERT INTO local_metadata (key, value) VALUES ('dataset_uuid', ?)", + [u.to_string()], + )?; + Ok(u) +} + + +#[derive(Debug)] +pub struct SchemaUpdateResult { + pub old_version: Option, + pub current_version: NonZeroUsize, + pub updated: bool, +} +/// Ensure that the schema in conn is current. +/// +/// If migrate is true, migrate the schema and return. +/// +/// If migrate is false, throw an Error if schema is not current. +pub fn ensure_schema(conn: &mut Connection, migrate: bool) + -> Result { + let old_version = schema_version(conn)?; + let current_version = unsafe { NonZeroUsize::new_unchecked(CURRENT_SCHEMA_VERSION) }; + Ok(SchemaUpdateResult { + old_version: None, + current_version: current_version, + updated: old_version == current_version, + }) +} + +#[derive(Debug,Display,Error,From)] +pub enum LocalUuidError { + StatementPrepareError(rusqlite::Error), + UuidError(uuid::Error), +} +/// Find the UUID of the dataset representing the directory containing the connected 'nancy.db' +pub fn local_uuid(conn: &Connection) -> Result { + let uuid = conn.prepare("SELECT value FROM local_metadata WHERE key = 'dataset_uuid' LIMIT 1")? + .query_row([], |row| row.get(0))?; + Ok(uuid) +} diff --git a/src/fs.rs b/src/fs.rs new file mode 100644 index 0000000..f76a3b9 --- /dev/null +++ b/src/fs.rs @@ -0,0 +1,15 @@ +//use jwalk; +use log; +use sha2::{Sha256, Digest, digest::FixedOutput}; +use std::path::{Path}; + + +#[derive(Debug)] +pub enum RecordError { + NotImplemented, +} + +pub fn record(path: &Path, message: &String) -> Result<(), RecordError> { + log::info!("Recording path {:?} using message \"{}\"", path, message); + Err(RecordError::NotImplemented) +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..0ce237b --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,23 @@ +use std::io; + +use rusqlite::{Connection}; +use uuid::{Uuid}; + +//pub mod data; +pub mod db; +pub mod fs; +pub mod program; +pub mod timing; + +pub fn print_uuid() -> io::Result { + let db_path = "nancy.db"; + + let conn = Connection::open(db_path).expect("Couldn't open database"); + conn.pragma_update(None, "foreign_keys", &"ON").unwrap(); + + let uuid = db::local_uuid(&conn).expect("Error when extracting local dataset UUID"); + + println!("UUID is {}", uuid); + + Ok(uuid) +} diff --git a/src/lib/main.rs b/src/lib/main.rs deleted file mode 100644 index 73b024a..0000000 --- a/src/lib/main.rs +++ /dev/null @@ -1,47 +0,0 @@ -use log; -use sqlite; - -pub struct Store<'a> { - conn: &'a sqlite::Connection, - uuid: &'a str, -} - -#[derive(Debug)] -pub enum DBError { - NoRowsReturned, - NoneValue, -} - -pub fn get_uuid(conn: &sqlite::Connection) -> Result { - let mut uuid = Err(DBError::NoRowsReturned); - conn.iterate( - " - SELECT value FROM local_metadata WHERE key = 'store_uuid'; - ", - |pairs| { - uuid = match (*pairs)[0].1 { - Some(u) => Ok(u.to_string()), - None => Err(DBError::NoneValue), - }; - - false // only process the first row - }, - ); - uuid -} - -pub fn say_hello() { - log::trace!("enter say_hello()"); - println!("If you see nothing in the next line, set RUST_LOG to info or lower"); - log::info!("Hello from libnancy.rlib!!!"); - - let store_path = "nancy.db"; - let conn = sqlite::open(store_path).unwrap(); - let uuid = get_uuid(&conn).unwrap(); - let this_store = Store { - conn: &conn, - uuid: &uuid, - }; - - println!("Store UUID: {}", this_store.uuid); -} diff --git a/src/lib/store.rs b/src/lib/store.rs deleted file mode 100644 index 4ffbb3a..0000000 --- a/src/lib/store.rs +++ /dev/null @@ -1,3 +0,0 @@ -struct Store{ -}; - diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..be264bc --- /dev/null +++ b/src/main.rs @@ -0,0 +1,106 @@ +use clap::{Parser, Subcommand}; +use env_logger; +use log; +use rusqlite::{Connection}; + +use nancy; + +use std::{process}; +use std::path::{PathBuf}; + +// Composable provenance tracking for scientific data analysis +#[derive(Parser)] +#[command(author, version, about, long_about = None, arg_required_else_help=true)] +struct Cli { + #[command(subcommand)] + command: Option, +} + +#[derive(Subcommand)] +enum Commands { + /// Record changes to files/directories within a dataset (or create a new dataset) + #[command()] + Record { + /// If not within an existing dataset, create one + #[arg(short, long)] + initialize: bool, + /// Use + #[arg(short, long)] + message: String, + #[arg()] + record_paths: Vec, + }, + /// Check for changes in dataset and print basic statistics + Status {}, + /// Just say hello + Hello { + //#[arg(short, long)] + //fromlib: bool, + }, +} + +fn main() { + env_logger::init(); + let args = Cli::parse(); + + match &args.command { + Some(Commands::Hello {}) => { + println!("Hello from nancy (binary)!"); + match nancy::print_uuid() { + Ok(_) => { + println!("OK"); + }, + Err(e) => { + println!("SQLite error: {}", e); + }, + }; + } + Some(Commands::Record { initialize, message, record_paths }) => { + // Determine dataset dir (ds_dir) + // If no paths given, use ["."] for the following steps. + // First, for each requested record_path, find whether it exists within a dataset dir + // Ensure that all paths are in the same dataset dir, or none belong to a dataset dir + // If no dataset dirs found, get top directory containing them all and set this as ds_dir + let mut conn = Connection::open_in_memory().expect("Could not create in-memory db"); + // If initializing: + match nancy::db::init(&mut conn) { + Err(e) => { + log::error!("Encountered error in initializing schema: {:?}", e); + process::exit(1); + } + Ok(dataset_uuid) => { + log::trace!("Init OK"); + log::info!("Dataset UUID is {dataset_uuid}"); + nancy::program::with_program( + &mut conn, + "INIT", + "Initialize dataset", + |prog| { + let okres: Result<(), ()> = Ok(()); + okres + } + ).expect("Empty program should not throw error"); + } + } + match nancy::program::with_program(&mut conn, "RECORD", message, |prog| { + let dataset_path = PathBuf::from("."); + // Note that this may fail, in which case we should roll back only this program + // but keep the dataset initialized. + nancy::fs::record( + &dataset_path, + message, + ) + }) { + Err(e) => { + log::error!("Encountered error in RECORD program: {:?}", e); + process::exit(3); + }, + Ok(()) => (), + } + } + Some(Commands::Status {}) => { + println!("status not yet implemented"); + } + None => {} + } +} diff --git a/src/migrations/20221024_initial_schema.sql b/src/migrations/20221024_initial_schema.sql new file mode 100644 index 0000000..90b8fa8 --- /dev/null +++ b/src/migrations/20221024_initial_schema.sql @@ -0,0 +1,300 @@ +-- Simple key/value table describing _this_ store (not imported ones). +-- In particular, the key "local_store" should be the UUID of the store in this +-- directory. +CREATE TABLE local_metadata( + key TEXT PRIMARY KEY NOT NULL, + value TEXT +); + + +-- This lets us enable or disable triggers. For example, when importing we need +-- to update some tables to account for changing primary keys using ON UPDATE +-- CASCADE. However, some dependent tables like filedir may have a trigger that +-- prevents all updates: e.g. for deleted files. In this case, we need to be +-- able to disable that one specific trigger during imports, then re-enable it. +-- We would do so by updating this table and setting enabled=False/True. +-- +-- Note that entries in this table only need to be inserted when we know we have +-- a need to disable the trigger. Otherwise, this table can be ignored. +CREATE TABLE triggers( + name TEXT PRIMARY KEY NOT NULL, + enabled BOOL +); + + +-- We track host machines, so that we can give better information about file +-- locations. This also allows us to see whether platform-dependent behavior may +-- come into play. +-- A row of this table should describe the aspects of a node that are unlikely +-- to change during regular software updates. For example, the hostname and CPU +-- type are included but not the kernel version. Software that changes due to +-- updates should be included in the "environment" table instead. +CREATE TABLE machine( + sha256 BLOB PRIMARY KEY NOT NULL, + machine_id TEXT, -- platform-dependent unique hardware id + -- Linux: open('/etc/machine-id', 'r').read() (assumes systemd) + -- OSX: `ioreg -rd1 -c IOPlatformExpertDevice | grep IOPlatformUUID | awk '{$print $3}' | tr -d \"` + -- Windows: `reg query HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Cryptography /v MachineGuid` + + hostname TEXT, -- platform.node(): 'lucky' + system TEXT, -- platform.system(): 'Linux' + cpu_type TEXT, -- platform.machine(): 'x86_64' + processor TEXT -- platform.processor(): +); +-- Programs are run by users on machines +CREATE TABLE user( + sha256 BLOB PRIMARY KEY NOT NULL, + username TEXT NOT NULL, -- getpass.getuser() + userid INTEGER, -- os.getuid() + fullname TEXT, -- on Linux/OSX: pwd.getpwuid(os.getuid()).pw_gecos + -- on Windows: see https://stackoverflow.com/questions/21766954/how-to-get-windows-users-full-name-in-python + machine BLOB NOT NULL REFERENCES machine ON UPDATE CASCADE +); +CREATE INDEX FK_user_machine ON user (machine); + +-- Stores and files (and directories) +-- These are the primary objects tracked by nancy. +-- A store is a directory containing a file called nancy.db (e.g. the dir holding this database) +-- In order to reliably merge store entries (like when we have converging +-- dependencies), we need to deduplicate. The machine and path could match, for +-- example if a database is created in one location then copied elsewhere, +-- followed by regenerating the original database. In these cases, we would want +-- to distinguish the new database from the copied one while still being able to +-- equate two simple copies of a db. +-- For this we use a UUID generated randomly +-- by the client. SQLite does not natively support UUIDs but it is stored here +-- as a string. +-- Client code should generate random UUIDs in the RFC 4122 variant layout. +-- https://datatracker.ietf.org/doc/html/rfc4122.html +-- This is possible in Python by simply calling uuid.uuid4() with no arguments +CREATE TABLE store ( + uuid BLOB PRIMARY KEY NOT NULL -- UUID generated by uuid.uuid4() +); + + +-- The filedir table holds all files and directories that are tracked by the +-- store. This table also holds tracked files and directories that have been +-- imported and live outside the current store. +-- We do not support renaming files. +CREATE TABLE filedir ( + sha256 BLOB PRIMARY KEY NOT NULL, + store BLOB NOT NULL REFERENCES store ON UPDATE CASCADE, + name TEXT, -- only a filename, not a path + parent BLOB REFERENCES filedir ON UPDATE CASCADE, + UNIQUE(store, name, parent) +); +CREATE INDEX FK_filedir_store ON filedir (store); +CREATE INDEX FK_filedir_parent ON filedir (parent); +-- Detect cross-store references +CREATE TRIGGER insert_filedir BEFORE INSERT ON filedir +BEGIN SELECT CASE + WHEN NEW.parent IS NOT NULL AND NEW.store != (SELECT store FROM filedir WHERE sha256 = NEW.parent) + THEN RAISE (ABORT, 'Parent resides in different store') +END; END; +CREATE TRIGGER update_filedir BEFORE UPDATE ON filedir +BEGIN + SELECT RAISE (ABORT, 'No updates to filedir allowed'); +END; + + +-- This table holds _versions_ of files and directories. This table holds +-- information that is independent of the location within the filesystem and +-- merely indicates a version of the content in a format that is nearly +-- independent of filetype (though the computation of content hashes is of +-- course dependent on filetype). Each version has a number, and was provided by +-- some program (and potentially a datum). +CREATE TABLE filedir_version ( + uuid BLOB PRIMARY KEY NOT NULL, + filedir BLOB NOT NULL + REFERENCES filedir ON UPDATE CASCADE, -- parent filedir entry + + recorded_time REAL, -- When was this version recorded? + + -- Note that changing filetype (e.g. directory becomes file) or deleting a + -- file are simply just new versions of a filedir. + filetype TEXT, -- One of 'LNK', 'DIR', 'REG', etc. See store.FSEntry.from_path for details + deleted BOOL NOT NULL, -- set True when recording a deleted file + + -- We record the permissions on each file to enable fixing if needed + perms TEXT, -- stat.filemode(os.stat(path).st_mode): '-rw-rw-r--' + + symlink_target TEXT, -- if this is a symlink, this is the (read but not fully resolved) target. i.e. this is the "content" of the symlink. + + -- The following hash can be NULL if the file was deleted. It could also be + -- null if hashing is deferred. Hash deferring would be preferred in cases + -- where a program must use a file or directory for multiple tasks, and each + -- task might mutate the directory. In these cases, each task will add an + -- additional version, but we do not want to hash every version since that + -- would needlessly slow down execution, for very little benefit since we + -- would be recording hashes for versions that may not be final. Instead, we + -- can defer by only hashing files and directories at the end of a program. + sha256 BLOB, + + source_task BLOB REFERENCES task ON UPDATE CASCADE +); +CREATE INDEX FK_filedir_version_filedir ON filedir_version (filedir); +CREATE INDEX FK_filedir_version_source_task ON filedir_version (source_task); +-- Disallow UPDATING filedir_version. Instead, new version should be created. +-- One exception is during importing, in which case we can disable the trigger +INSERT INTO triggers VALUES('update_filedir_version', TRUE); +CREATE TRIGGER update_filedir_version BEFORE UPDATE ON filedir_version +BEGIN SELECT CASE + WHEN (SELECT enabled FROM triggers WHERE name = 'update_filedir_version') + THEN RAISE (ABORT, 'Updating filedir versions is prohibited') +END; END; +-- TODO: check for inserting inconsistent version info re. deleted flag/filetype + + + +-- A computational environment which can execute "programs". Note that the +-- python executable being used, and environment variables are recorded here. +-- Other info is available in the parent "machine" table. +CREATE TABLE environment ( + sha256 BLOB PRIMARY KEY NOT NULL, + + envvars_json TEXT, -- json.dumps(dict(os.environ), sort_keys=True) + python_implementation TEXT, -- platform.python_implementation(): 'cpython' + python_strversion TEXT, -- sys.version: '3.9.7 (default, Sep 16 2021, 13:09:58) \n[GCC 7.5.0]' + python_hexversion INTEGER, -- sys.hexversion: 50923504 + user TEXT REFERENCES user ON UPDATE CASCADE, + timezone TEXT, -- timezone, for interpreting event times + platform_release TEXT, -- platform.release(): '5.15.64' + freedesktop_os_release TEXT, -- requires python 3.10 + -- platform.freedesktop_os_release() as JSON + -- "{'NAME': 'NixOS', 'ID': 'nixos', + -- 'PRETTY_NAME': 'NixOS 22.05 (Quokka)', 'BUG_REPORT_URL': + -- 'https://github.com/NixOS/nixpkgs/issues', 'BUILD_ID': + -- '22.05.20220902.67e4507', 'DOCUMENTATION_URL': + -- 'https://nixos.org/learn.html', 'HOME_URL': 'https://nixos.org/', 'LOGO': + -- 'nix-snowflake', 'SUPPORT_URL': 'https://nixos.org/community.html', + -- 'VERSION': '22.05 (Quokka)', 'VERSION_CODENAME': 'quokka', 'VERSION_ID': + -- '22.05'}" + win32_ver TEXT, -- platform.win32_ver() as JSON + mac_ver TEXT -- platform.mac_ver() as JSON +); + + +-- A "program" is an execution of a single graph, starting from a number of roots +-- Note that if multiple calls to nancy.run() are provided within a single +-- python script, multiple "programs" may be created. Also note that historical +-- programs are automatically imported and merged when possible when loading a +-- "datum" from disk. +CREATE TABLE program ( + uuid BLOB PRIMARY KEY NOT NULL, + name TEXT, -- name of the program, usually written lowercase by calling code e.g. cnn_crossval + -- Names of built-in operations will be shown in upper case: e.g. 'FREEZE' + + -- we use POSIX timestamps for time recording. + -- e.g. datetime.datetime.now().timestamp() + start_time REAL, + end_time REAL, + + process_id INTEGER, -- host PID of python process on host OS + environment BLOB NOT NULL REFERENCES environment ON UPDATE CASCADE, + message TEXT NOT NULL -- user-defined message to help distinguish similar runs +); +CREATE INDEX FK_program_environment ON program (environment); + + +-- We try to track all python packages that impact execution by traversing a +-- copy of sys.modules. This is done once before a "program" and once after in +-- case some calling code winds up calling a previously-unloaded module. +CREATE TABLE package ( + sha256 BLOB PRIMARY KEY NOT NULL, + name TEXT NOT NULL, + version TEXT, + + UNIQUE (name, version) +); +-- A module describes any python module file containing decorated Functions. +-- Modules are tracked since they impact the global scope of function calls. +CREATE TABLE module( + sha256 BLOB PRIMARY KEY NOT NULL, + name TEXT NOT NULL, + code TEXT, -- code doesn't have to be included, but should be used to create sha256 + package BLOB REFERENCES package ON UPDATE CASCADE +); +CREATE INDEX FK_module_package ON module (package); +-- A func just describes a function, without reference to its arguments. +-- It can have inputs and outputs, which are described in the func_inputs and +-- func_outputs children tables. +CREATE TABLE func( + sha256 BLOB PRIMARY KEY NOT NULL, + name TEXT NOT NULL, + module BLOB NOT NULL REFERENCES module ON UPDATE CASCADE +); +CREATE INDEX FK_func_module ON func (module); +CREATE TABLE func_input( + uuid BLOB PRIMARY KEY NOT NULL, + name TEXT NOT NULL, + typename TEXT NOT NULL, + func BLOB NOT NULL REFERENCES func ON UPDATE CASCADE, + + position INTEGER, + posonly BOOL, + kwonly BOOL, + + description TEXT +); +CREATE INDEX FK_func_input_func ON func_input (func); +CREATE TABLE func_output( + uuid BLOB PRIMARY KEY NOT NULL, + name TEXT, + typename TEXT NOT NULL, + func BLOB NOT NULL REFERENCES func ON UPDATE CASCADE, + + position INTEGER NOT NULL, + description TEXT +); +CREATE INDEX FK_func_output_func ON func_output (func); + + +-- Tasks are usually executed calls to Functions: they correspond to a +-- particular set of inputs which are themselves data (see datum table). A task +-- is executed in the context of a "program". Within a program, tasks are +-- typically evaluated in a serial manner. +CREATE TABLE task( + uuid BLOB PRIMARY KEY NOT NULL, + program BLOB NOT NULL REFERENCES program ON UPDATE CASCADE, + + -- func is NULL for some built-in functionality like "RECORD" programs + func BLOB REFERENCES func ON UPDATE CASCADE +); +CREATE INDEX FK_task_program ON task (program); +CREATE INDEX FK_task_func ON task (func); +-- A datum is an object that is computed as the output of a task, given as a +-- literal value in a config file, or loaded from a file. +CREATE TABLE datum( + uuid BLOB PRIMARY KEY NOT NULL, + -- provider_type describes where the datum came from. Choices are: + -- - COMPUTED: output of a decorated Function + -- - IMPORTED: imported output from a prior program + -- - LITERAL: literal value provided in a config file + -- Note that literals are implicitly imported (hash computed) at runtime + provider_type TEXT, + + -- task and task_output are only used when provider_type='COMPUTED' + task BLOB REFERENCES task ON UPDATE CASCADE, + task_output BLOB REFERENCES func_output ON UPDATE CASCADE, + + literal_json TEXT, -- if provider_type='LITERAL', a JSON representation of the value + + typename TEXT NOT NULL -- string representation of the data type +); +CREATE INDEX FK_datum_task ON datum (task); +CREATE INDEX FK_datum_task_output ON datum (task_output); +-- A task_input records the version of a Datum that is passed to a function +CREATE TABLE task_input( + uuid BLOB PRIMARY KEY NOT NULL, + task BLOB NOT NULL REFERENCES task ON UPDATE CASCADE, + -- if this was a python function, reference which input + func_input BLOB REFERENCES func_input ON UPDATE CASCADE, + + datum BLOB NOT NULL REFERENCES datum ON UPDATE CASCADE, + -- Data have versions to facilitate tracking non-const operations. If a datum + -- is passed to a non-const operation, it must increment its internal + -- version + datum_version INTEGER NOT NULL +); +CREATE INDEX FK_task_input_func_input ON task_input (func_input); +CREATE INDEX FK_task_input_datum ON task_input (datum); diff --git a/src/program.rs b/src/program.rs new file mode 100644 index 0000000..9ccbad3 --- /dev/null +++ b/src/program.rs @@ -0,0 +1,90 @@ +use log; +use rusqlite::{Connection,Error as RSQLError}; +extern crate derive_more; +use derive_more::{From}; + +use std::time::{Instant, SystemTime}; + +use crate::timing; + +pub struct Task { + start: Instant, + end: Option, + uuid: String, +} + +pub struct TaskInput { + task: Task, + datum: T, +} + +pub struct Program { +} +impl Program { + //fn new(conn: &Connection) -> Self { + //Program { + //} + //} + //fn new_task(self: &Program) -> Task { + //} +} + +#[derive(Debug)] +pub enum ProgramError { + CreateTransactionFailed(RSQLError), + ProgramFailed(E), + CommitFailed(RSQLError), +} +impl From for ProgramError { + fn from(e: E) -> ProgramError { ProgramError::ProgramFailed(e) } +} + +/// Run a closure as a program, within a database transaction +pub fn with_program Result<(), E>>( + conn: &mut Connection, + name: &str, + message: &str, + f: F, + ) -> Result<(), ProgramError> { + // NOTE: for Errors outside of f(prog), we should not rely on ?, so that we can keep the + // From instance for ProgramError as general as possible. Instead, we should manually check + // errors in the surrounding code in this function and only use ? for f(prog). + + let log_target = &format!("nancy.program ({})", name); + + log::info!(target: log_target, "Running program {} ({})", name, message); + + // start transaction + match conn.transaction() { + Err(e) => Err(ProgramError::CreateTransactionFailed(e)), + Ok(tx) => { + // start timer + log::debug!(target: log_target, "Starting timers"); + let start_st = SystemTime::now(); + let start = Instant::now(); + let start_stamp = timing::persistent_stamp( + Instant::now(), + start, + start_st); + + // Instantiate Program + // (record name and message for new program, get program ID) + let prog = Program {}; + + // run closure with program argument + f(prog)?; // if closure fails, transaction will be rolled back + + // end timer + // Commit scope for RECORD program + let end = Instant::now(); + log::debug!(target: log_target, "Start time: {:?} f64 timestamp={:#?}", start_st, start_stamp); + log::debug!(target: log_target, "Elapsed: {} seconds", (end - start).as_secs_f64()); + // record end time + // commit transaction + match tx.commit() { + Err(e) => Err(ProgramError::CommitFailed(e)), + Ok(_) => Ok(()), + } + }, + } +} diff --git a/src/timing.rs b/src/timing.rs new file mode 100644 index 0000000..263b83b --- /dev/null +++ b/src/timing.rs @@ -0,0 +1,11 @@ +use std::time::{Instant, SystemTime, UNIX_EPOCH}; + +/// This converts an Instant to a double-precision float, which as of October 2022 has roughly 300 +/// ns of resolution. +pub fn persistent_stamp(instant: Instant, base_instant: Instant, base_systime: SystemTime) -> f64 { + base_systime.checked_add(instant - base_instant) + .expect("Bounds error in std::time::SystemTime::checked_add") + .duration_since(UNIX_EPOCH) + .expect("Clock may have gone backward") + .as_secs_f64() +}