From 15791b0ba2644bb86156fd115a011df0eb384f64 Mon Sep 17 00:00:00 2001
From: Tom Fay
Date: Tue, 17 Sep 2024 12:24:31 +0100
Subject: [PATCH] more layer hacking

---
 hack.py                           |  33 --
 src/archive.rs                    | 101 +-----
 src/lockfile/build.rs             | 217 ++++++++++--
 src/lockfile/layers.py            |  67 ++++
 src/lockfile/nix_closure_graph.py | 566 ++++++++++++++++++++++++++++++
 5 files changed, 830 insertions(+), 154 deletions(-)
 delete mode 100644 hack.py
 create mode 100644 src/lockfile/layers.py
 create mode 100644 src/lockfile/nix_closure_graph.py

diff --git a/hack.py b/hack.py
deleted file mode 100644
index dd65596..0000000
--- a/hack.py
+++ /dev/null
@@ -1,33 +0,0 @@
-import pdb
-import dnf
-
-base = dnf.Base()
-base.fill_sack()
-query = dnf.query.Query(base.sack)
-installed = query.installed()
-
-from collections import defaultdict
-graph = defaultdict(set)
-
-for pkg in installed:
-    for req in pkg.requires:
-        providers = installed.filter(provides=req)
-        if providers:
-            for provider in providers:
-                if pkg.name != provider.name and pkg not in graph[provider]:
-                    graph[pkg].add(provider)
-
-from graphlib import TopologicalSorter, CycleError
-cycles = True
-ts = TopologicalSorter(graph)
-
-while cycles:
-    try:
-        result = [*TopologicalSorter(graph).static_order()]
-        cycles = False
-    except CycleError as e:
-        # Remove a cycle
-        graph[e.args[1][1]].remove(e.args[1][0])
-
-import pprint
-pprint.pprint([p.name for p in result])
diff --git a/src/archive.rs b/src/archive.rs
index e27795f..e38db7e 100644
--- a/src/archive.rs
+++ b/src/archive.rs
@@ -13,110 +13,13 @@
 //! You should have received a copy of the GNU General Public License
 //! along with this program. If not, see <https://www.gnu.org/licenses/>.
 use anyhow::{Context, Result};
-use std::{
-    collections::{hash_map::Entry, HashMap},
-    io::Write,
-    os::unix::{
-        fs::MetadataExt,
-        prelude::{FileTypeExt, OsStrExt},
-    },
-    path::{Path, PathBuf},
-};
-use walkdir::WalkDir;
+use std::{io::Write, os::unix::prelude::OsStrExt, path::Path};
 
 // https://mgorny.pl/articles/portability-of-tar-features.html#id25
 const PAX_SCHILY_XATTR: &[u8; 13] = b"SCHILY.xattr.";
 
-/// custom implementation of tar-rs's append_dir_all that:
-/// - works around https://github.com/alexcrichton/tar-rs/issues/102 so that security capabilities are preserved
-/// - emulates tar's `--clamp-mtime` option so that any file/dir/symlink mtimes are no later than a specific value
-/// - supports hardlinks
-pub(super) fn append_dir_all_with_xattrs(
-    builder: &mut tar::Builder<impl Write>,
-    src_path: impl AsRef<Path>,
-    clamp_mtime: i64,
-) -> Result<()> {
-    let src_path = src_path.as_ref();
-    // Map (dev, inode) -> path for hardlinks
-    let mut hardlinks: HashMap<(u64, u64), PathBuf> = HashMap::new();
-
-    for entry in WalkDir::new(src_path)
-        .follow_links(false)
-        .sort_by_file_name()
-        .into_iter()
-    {
-        let entry = entry?;
-        let meta = entry.metadata()?;
-        // skip sockets as tar-rs errors when trying to archive them.
-        // For comparison, umoci also errors, whereas docker skips them
-        if meta.file_type().is_socket() {
-            continue;
-        }
-
-        let rel_path = pathdiff::diff_paths(entry.path(), src_path)
-            .expect("walkdir returns path inside of search root");
-        if rel_path == Path::new("") {
-            continue;
-        }
-
-        if entry.file_type().is_symlink() {
-            if meta.mtime() > clamp_mtime {
-                // Setting the mtime on a symlink is fiddly with tar-rs, so we use filetime to change
-                // the mtime before adding the symlink to the tar archive
-                let mtime = filetime::FileTime::from_unix_time(clamp_mtime, 0);
-                filetime::set_symlink_file_times(entry.path(), mtime, mtime)?;
-            }
-            add_pax_extension_header(entry.path(), builder)?;
-            builder.append_path_with_name(entry.path(), rel_path)?;
-        } else if entry.file_type().is_file() || entry.file_type().is_dir() {
-            add_pax_extension_header(entry.path(), builder)?;
-
-            // If this is a hardlink, add a link header instead of the file
-            // if this isn't the first time we've seen this inode
-            if meta.nlink() > 1 {
-                match hardlinks.entry((meta.dev(), meta.ino())) {
-                    Entry::Occupied(e) => {
-                        // Add link header and continue to next entry
-                        let mut header = tar::Header::new_gnu();
-                        header.set_metadata(&meta);
-                        if meta.mtime() > clamp_mtime {
-                            header.set_mtime(clamp_mtime as u64);
-                        }
-                        header.set_entry_type(tar::EntryType::Link);
-                        header.set_cksum();
-                        builder.append_link(&mut header, &rel_path, e.get())?;
-                        continue;
-                    }
-                    Entry::Vacant(e) => {
-                        // This is the first time we've seen this inode
-                        e.insert(rel_path.clone());
-                    }
-                }
-            }
-
-            let mut header = tar::Header::new_gnu();
-            header.set_size(meta.len());
-            header.set_metadata(&meta);
-            if meta.mtime() > clamp_mtime {
-                header.set_mtime(clamp_mtime as u64);
-            }
-            if entry.file_type().is_file() {
-                builder.append_data(
-                    &mut header,
-                    rel_path,
-                    &mut std::fs::File::open(entry.path())?,
-                )?;
-            } else {
-                builder.append_data(&mut header, rel_path, &mut std::io::empty())?;
-            };
-        }
-    }
-
-    Ok(())
-}
-
 // Convert any extended attributes on the specified path to a tar PAX extension header, and add it to the tar archive
-fn add_pax_extension_header(
+pub(crate) fn add_pax_extension_header(
     path: impl AsRef<Path>,
     builder: &mut tar::Builder<impl Write>,
 ) -> Result<(), anyhow::Error> {
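`add_pax_extension_header` stores each extended attribute as a `SCHILY.xattr.`-prefixed record in a pax extended header, which is how tar preserves attributes like `security.capability` (see the mgorny article linked above). For reference, a pax record is length-prefixed, and the length counts the entire record including its own digits. A minimal sketch of that encoding in Python; the attribute name and file path are only examples:

```python
import os


def pax_xattr_record(attr: str, value: bytes) -> bytes:
    # One pax record: b"<len> SCHILY.xattr.<attr>=<value>\n", where <len> is
    # the decimal byte length of the whole record, including <len> itself.
    body = b" SCHILY.xattr." + attr.encode() + b"=" + value + b"\n"
    length = len(body)
    # Iterate to a fixed point: adding the length digits changes the length.
    while len(str(length)) + len(body) != length:
        length = len(str(length)) + len(body)
    return str(length).encode() + body


# e.g. the record preserving a file capability (path is illustrative):
cap = os.getxattr("/usr/bin/ping", "security.capability")
print(pax_xattr_record("security.capability", cap))
```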
diff --git a/src/lockfile/build.rs b/src/lockfile/build.rs
index 5d2d21c..3dc5699 100644
--- a/src/lockfile/build.rs
+++ b/src/lockfile/build.rs
@@ -12,9 +12,11 @@
 //!
 //! You should have received a copy of the GNU General Public License
 //! along with this program. If not, see <https://www.gnu.org/licenses/>.
+use std::collections::hash_map::Entry;
 use std::collections::HashMap;
-use std::ffi::OsStr;
-use std::path::Path;
+use std::ffi::{OsStr, OsString};
+use std::os::unix::fs::{FileTypeExt, MetadataExt};
+use std::path::{Path, PathBuf};
 use std::{fs, process::Command};
 
 use anyhow::{bail, Context, Result};
@@ -22,12 +24,15 @@ use chrono::DateTime;
 use flate2::Compression;
 use glob::glob;
 use ocidir::oci_spec::image::MediaType;
-use ocidir::{new_empty_manifest, OciDir};
+use ocidir::{new_empty_manifest, GzipLayerWriter, OciDir};
+use pyo3::types::{PyAnyMethods, PyModule, PyTuple};
+use pyo3::{Python, ToPyObject};
 use rusqlite::Connection;
 use tempfile::TempDir;
+use walkdir::WalkDir;
 
 use super::Lockfile;
-use crate::archive::append_dir_all_with_xattrs;
+use crate::archive::add_pax_extension_header;
 use crate::config::Config;
 use crate::write;
 use ocidir::cap_std::fs::Dir;
@@ -69,16 +74,11 @@ impl Lockfile {
         }
         .context("Failed to create installroot")?;
 
-        // Create the root filesystem layer
-        write::ok("Creating", "root filesystem layer")?;
-        let mut builder = oci_dir.create_layer(Compression::fast().into())?;
-        builder.follow_symlinks(false);
-        append_dir_all_with_xattrs(&mut builder, installroot.path(), creation_time.timestamp())
-            .context("failed to archive root filesystem")?;
-        let layer = builder.into_inner()?.complete()?;
+        // Split the root filesystem into layers, most popular packages first
+        let mut layer_builder =
+            LayerBuilder::new(&oci_dir, installroot.path(), 100, creation_time)?;
+        layer_builder.create_image_layers()?;
 
-        // Create the image configuration blob
-        write::ok("Writing", "image configuration blob")?;
         let mut image_config = cfg
             .image
             .to_oci_image_configuration(labels, creation_time)?;
@@ -86,15 +86,7 @@ impl Lockfile {
         let mut manifest = new_empty_manifest()
             .media_type(MediaType::ImageManifest)
             .build()?;
-        oci_dir.push_layer_full(
-            &mut manifest,
-            &mut image_config,
-            layer,
-            Option::<HashMap<String, String>>::None,
-            CREATED_BY,
-            creation_time,
-        );
-
+        layer_builder.finish(&mut manifest, &mut image_config)?;
         write::ok("Writing", "image manifest and config")?;
         oci_dir.insert_manifest_and_config(
             manifest,
@@ -181,6 +173,22 @@ impl Lockfile {
     }
 }
 
+fn calculate_layer_map(path: &Path, max_layers: usize) -> Result<HashMap<OsString, usize>> {
+    Python::with_gil(|py| {
+        // nix_closure_graph is a Python module, embedded at build time, that layers.py imports
+        let _nix_closure_graph = PyModule::from_code_bound(
+            py,
+            include_str!("nix_closure_graph.py"),
+            "nix_closure_graph",
+            "nix_closure_graph",
+        )?;
+        let layers = PyModule::from_code_bound(py, include_str!("layers.py"), "layers", "resolve")?;
+        let args = PyTuple::new_bound(py, &[path.to_object(py), max_layers.to_object(py)]);
+        Ok::<_, anyhow::Error>(layers.getattr("layers")?.call1(args)?.extract()?)
+    })
+    .context("Failed to determine layer graph")
+}
+
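`calculate_layer_map` extracts the dict returned by Python into a `HashMap<OsString, usize>` mapping file paths to layer ids. For reference while reading the Rust side, this is the shape `layers.py` (added later in this patch) hands back when driven directly; the installroot and file names below are made up for illustration:

```python
from layers import layers  # assuming src/lockfile is on sys.path

# Files owned by the hundred most popular packages get a layer id;
# anything else is absent from the map and lands in the catch-all layer.
layer_map = layers("/tmp/installroot", 100)
# e.g. {"/usr/lib64/libc.so.6": 0, "/usr/bin/bash": 7, ...}
```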
 fn creation_time() -> Result<DateTime<Utc>, anyhow::Error> {
     let creation_time = if let Ok(sde) = std::env::var("SOURCE_DATE_EPOCH") {
         let timestamp = sde
@@ -199,3 +207,169 @@ fn disable_sqlite_journaling(path: &Path) -> Result<()> {
     conn.pragma_update(None, "journal_mode", "DELETE")?;
     Ok(())
 }
+
+struct LayerBuilder<'a> {
+    package_builders: Vec<tar::Builder<GzipLayerWriter<'a>>>,
+    path_to_layer_map: HashMap<OsString, usize>,
+    catchall: tar::Builder<GzipLayerWriter<'a>>,
+    root: PathBuf,
+    oci_dir: &'a OciDir,
+    creation_time: DateTime<Utc>,
+}
+
+impl<'a> LayerBuilder<'a> {
+    fn new(
+        oci_dir: &'a OciDir,
+        installroot: impl Into<PathBuf>,
+        max_layers: usize,
+        creation_time: DateTime<Utc>,
+    ) -> Result<Self> {
+        let root = installroot.into();
+        let path_to_layer_map = calculate_layer_map(&root, max_layers)?;
+
+        // Layer ids are 0-based, so create one builder per id, max id + 1 in total
+        let layer_count = path_to_layer_map
+            .values()
+            .max()
+            .map(|max_id| max_id + 1)
+            .unwrap_or(0);
+        let mut package_builders = Vec::with_capacity(layer_count);
+        for _i in 0..layer_count {
+            let mut builder = oci_dir.create_layer(Compression::fast().into())?;
+            builder.follow_symlinks(false);
+            package_builders.push(builder);
+        }
+
+        let mut catchall = oci_dir.create_layer(Compression::fast().into())?;
+        catchall.follow_symlinks(false);
+
+        Ok(Self {
+            package_builders,
+            path_to_layer_map,
+            catchall,
+            root,
+            oci_dir,
+            creation_time,
+        })
+    }
+
+    /// custom implementation of tar-rs's append_dir_all that:
+    /// - works around https://github.com/alexcrichton/tar-rs/issues/102 so that security capabilities are preserved
+    /// - emulates tar's `--clamp-mtime` option so that any file/dir/symlink mtimes are no later than a specific value
+    /// - supports hardlinks
+    fn create_image_layers(&mut self) -> Result<()> {
+        // Map (dev, inode) -> path for hardlinks
+        let mut hardlinks: HashMap<(u64, u64), PathBuf> = HashMap::new();
+        let clamp_mtime = self.creation_time.timestamp();
+
+        for entry in WalkDir::new(&self.root)
+            .follow_links(false)
+            .sort_by_file_name()
+            .into_iter()
+        {
+            let entry = entry?;
+            let meta = entry.metadata()?;
+            // skip sockets as tar-rs errors when trying to archive them.
+            // For comparison, umoci also errors, whereas docker skips them
+            if meta.file_type().is_socket() {
+                continue;
+            }
+
+            let rel_path = pathdiff::diff_paths(entry.path(), &self.root)
+                .expect("walkdir returns path inside of search root");
+            if rel_path == Path::new("") {
+                continue;
+            }
+
+            // layers.py keys files by their installed path, so re-root rel_path at "/"
+            let builder = match self.path_to_layer_map.get(Path::new("/").join(&rel_path).as_os_str()) {
+                Some(layer_id) => &mut self.package_builders[*layer_id],
+                None => &mut self.catchall,
+            };
+
+            if entry.file_type().is_symlink() {
+                if meta.mtime() > clamp_mtime {
+                    // Setting the mtime on a symlink is fiddly with tar-rs, so we use filetime to change
+                    // the mtime before adding the symlink to the tar archive
+                    let mtime = filetime::FileTime::from_unix_time(clamp_mtime, 0);
+                    filetime::set_symlink_file_times(entry.path(), mtime, mtime)?;
+                }
+                add_pax_extension_header(entry.path(), builder)?;
+                builder.append_path_with_name(entry.path(), rel_path)?;
+            } else if entry.file_type().is_file() || entry.file_type().is_dir() {
+                add_pax_extension_header(entry.path(), builder)?;
+
+                // If this is a hardlink, add a link header instead of the file
+                // if this isn't the first time we've seen this inode
+                if meta.nlink() > 1 {
+                    match hardlinks.entry((meta.dev(), meta.ino())) {
+                        Entry::Occupied(e) => {
+                            // Add link header and continue to next entry
+                            let mut header = tar::Header::new_gnu();
+                            header.set_metadata(&meta);
+                            if meta.mtime() > clamp_mtime {
+                                header.set_mtime(clamp_mtime as u64);
+                            }
+                            header.set_entry_type(tar::EntryType::Link);
+                            header.set_cksum();
+                            builder.append_link(&mut header, &rel_path, e.get())?;
+                            continue;
+                        }
+                        Entry::Vacant(e) => {
+                            // This is the first time we've seen this inode
+                            e.insert(rel_path.clone());
+                        }
+                    }
+                }
+
+                let mut header = tar::Header::new_gnu();
+                header.set_size(meta.len());
+                header.set_metadata(&meta);
+                if meta.mtime() > clamp_mtime {
+                    header.set_mtime(clamp_mtime as u64);
+                }
+                if entry.file_type().is_file() {
+                    builder.append_data(
+                        &mut header,
+                        rel_path,
+                        &mut std::fs::File::open(entry.path())?,
+                    )?;
+                } else {
+                    builder.append_data(&mut header, rel_path, &mut std::io::empty())?;
+                };
+            }
+        }
+
+        Ok(())
+    }
+
+    fn finish(
+        self,
+        manifest: &mut ocidir::oci_spec::image::ImageManifest,
+        image_config: &mut ocidir::oci_spec::image::ImageConfiguration,
+    ) -> Result<()> {
+        for builder in self.package_builders {
+            let layer = builder.into_inner()?.complete()?;
+            self.oci_dir.push_layer_full(
+                manifest,
+                image_config,
+                layer,
+                Option::<HashMap<String, String>>::None,
+                CREATED_BY,
+                self.creation_time,
+            );
+        }
+
+        let layer = self.catchall.into_inner()?.complete()?;
+
+        self.oci_dir.push_layer_full(
+            manifest,
+            image_config,
+            layer,
+            Option::<HashMap<String, String>>::None,
+            CREATED_BY,
+            self.creation_time,
+        );
+        Ok(())
+    }
+}
diff --git a/src/lockfile/layers.py b/src/lockfile/layers.py
new file mode 100644
index 0000000..c0dd95a
--- /dev/null
+++ b/src/lockfile/layers.py
@@ -0,0 +1,67 @@
+import dnf
+from collections import defaultdict
+from nix_closure_graph import make_graph_segment_from_root, graph_popularity_contest
+
+
+def create_package_graph(root):
+    """
+    Create a graph of installed packages and their dependencies.
+    """
+    conf = dnf.conf.Conf()
+    conf.installroot = root
+    base = dnf.Base(conf)
+    base.fill_sack()
+    query = dnf.query.Query(base.sack)
+    installed = query.installed()
+    graph = defaultdict(set)
+
+    for pkg in installed:
+        for req in pkg.requires:
+            providers = installed.filter(provides=req)
+            if providers:
+                for provider in providers:
+                    if pkg.name != provider.name and pkg not in graph[provider]:
+                        graph[pkg].add(provider)
+    return graph
+
+
+def remove_cycles(graph):
+    """
+    Repeatedly remove cycles from a graph until it's a DAG.
+    """
+    from graphlib import TopologicalSorter, CycleError
+
+    while True:
+        try:
+            _order = [*TopologicalSorter(graph).static_order()]
+            break
+        except CycleError as e:
+            # Remove a cycle
+            graph[e.args[1][1]].remove(e.args[1][0])
+    return graph
+
+
+def layers(root, max_layers):
+    """
+    Return a dictionary mapping files to layer ids.
+
+    Layer ids are integers starting from 0, with 0 being the most popular layer.
+
+    """
+    lookup = remove_cycles(create_package_graph(root))
+    new_graph = {}
+    for pkg in lookup.keys():
+        if pkg in new_graph:
+            continue
+        new_graph[pkg] = make_graph_segment_from_root(pkg, lookup)
+
+    popularity_graph = sorted(
+        graph_popularity_contest(new_graph).items(), key=lambda x: x[1], reverse=True
+    )
+    layer_dict = {}
+    for i, (pkg, _) in enumerate(popularity_graph):
+        if i >= max_layers:
+            break
+        for file in pkg.files:
+            layer_dict[file] = i
+    return layer_dict
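A note on `remove_cycles`: it leans on the `CycleError` contract from Python's `graphlib`, where `e.args[1]` is a list of nodes in the cycle, each an immediate predecessor of the next, with the first and last node repeated. Deleting the edge between the first two entries therefore breaks that particular cycle. A toy run with string nodes standing in for the dnf package objects:

```python
from collections import defaultdict
from graphlib import TopologicalSorter, CycleError

graph = defaultdict(set)
graph["a"].add("b")  # a depends on b
graph["b"].add("c")  # b depends on c
graph["c"].add("a")  # c depends on a: a cycle

try:
    list(TopologicalSorter(graph).static_order())
except CycleError as e:
    cycle = e.args[1]  # e.g. ["a", "c", "b", "a"]; cycle[0] is a predecessor of cycle[1]
    graph[cycle[1]].remove(cycle[0])

print(list(TopologicalSorter(graph).static_order()))  # now succeeds, e.g. ["c", "b", "a"]
```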
diff --git a/src/lockfile/nix_closure_graph.py b/src/lockfile/nix_closure_graph.py
new file mode 100644
index 0000000..314cdae
--- /dev/null
+++ b/src/lockfile/nix_closure_graph.py
@@ -0,0 +1,566 @@
+# Taken from https://github.com/NixOS/nixpkgs/blob/69df3ac140f95662ad519f3e453f579409f6e42b/pkgs/build-support/references-by-popularity/closure-graph.py#L408
+# and removed the `main()` invocation.
+
+# Copyright (c) 2003-2024 Eelco Dolstra and the Nixpkgs/NixOS contributors
+
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files (the
+# "Software"), to deal in the Software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of the Software, and to
+# permit persons to whom the Software is furnished to do so, subject to
+# the following conditions:
+
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+
+# IMPORTANT: Making changes?
+#
+# Validate your changes with python3 ./closure-graph.py --test
+
+
+# Using a simple algorithm, convert the references to a path in to a
+# sorted list of dependent paths based on how often they're referenced
+# and how deep in the tree they live. Equally-"popular" paths are then
+# sorted by name.
+#
+# The existing writeClosure prints the paths in a simple ascii-based
+# sorting of the paths.
+#
+# Sorting the paths by graph improves the chances that the difference
+# between two builds appear near the end of the list, instead of near
+# the beginning. This makes a difference for Nix builds which export a
+# closure for another program to consume, if that program implements its
+# own level of binary diffing.
+#
+# For an example, Docker Images. If each store path is a separate layer
+# then Docker Images can be very efficiently transfered between systems,
+# and we get very good cache reuse between images built with the same
+# version of Nixpkgs. However, since Docker only reliably supports a
+# small number of layers (42) it is important to pick the individual
+# layers carefully. By storing very popular store paths in the first 40
+# layers, we improve the chances that the next Docker image will share
+# many of those layers.*
+#
+# Given the dependency tree:
+#
+#     A - B - C - D -\
+#      \   \   \      \
+#       \   \   \      \
+#        \   \   - E ---- F
+#         \- G
+#
+# Nodes which have multiple references are duplicated:
+#
+#     A - B - C - D - F
+#      \   \   \
+#       \   \   \- E - F
+#        \   \
+#         \   \- E - F
+#          \
+#           \- G
+#
+# Each leaf node is now replaced by a counter defaulted to 1:
+#
+#     A - B - C - D - (F:1)
+#      \   \   \
+#       \   \   \- E - (F:1)
+#        \   \
+#         \   \- E - (F:1)
+#          \
+#           \- (G:1)
+#
+# Then each leaf counter is merged with its parent node, replacing the
+# parent node with a counter of 1, and each existing counter being
+# incremented by 1. That is to say `- D - (F:1)` becomes `- (D:1, F:2)`:
+#
+#     A - B - C - (D:1, F:2)
+#      \   \   \
+#       \   \   \- (E:1, F:2)
+#        \   \
+#         \   \- (E:1, F:2)
+#          \
+#           \- (G:1)
+#
+# Then each leaf counter is merged with its parent node again, merging
+# any counters, then incrementing each:
+#
+#     A - B - (C:1, D:2, E:2, F:5)
+#      \   \
+#       \   \- (E:1, F:2)
+#        \
+#         \- (G:1)
+#
+# And again:
+#
+#     A - (B:1, C:2, D:3, E:4, F:8)
+#      \
+#       \- (G:1)
+#
+# And again:
+#
+#     (A:1, B:2, C:3, D:4, E:5, F:9, G:2)
+#
+# and then paths have the following "popularity":
+#
+#     A 1
+#     B 2
+#     C 3
+#     D 4
+#     E 5
+#     F 9
+#     G 2
+#
+# and the popularity contest would result in the paths being printed as:
+#
+#     F
+#     E
+#     D
+#     C
+#     B
+#     G
+#     A
+#
+# * Note: People who have used a Dockerfile before assume Docker's
+# Layers are inherently ordered. However, this is not true -- Docker
+# layers are content-addressable and are not explicitly layered until
+# they are composed in to an Image.
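The A-to-G walkthrough above can be checked against the functions this module defines. A small sketch using single-letter stand-ins for store paths, run from a context where the module is importable (e.g. src/lockfile):

```python
from nix_closure_graph import make_graph_segment_from_root, graph_popularity_contest

# The A - G dependency tree from the comment, as a reference lookup.
lookup = {
    "A": ["B", "G"],
    "B": ["C", "E"],
    "C": ["D", "E"],
    "D": ["F"],
    "E": ["F"],
    "F": [],
    "G": [],
}

full_graph = {"A": make_graph_segment_from_root("A", lookup)}
print(dict(graph_popularity_contest(full_graph)))
# {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 9, 'G': 2}
```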
+
+import sys
+import json
+import unittest
+
+from pprint import pprint
+from collections import defaultdict
+
+
+def debug(msg, *args, **kwargs):
+    if False:
+        print("DEBUG: {}".format(msg.format(*args, **kwargs)), file=sys.stderr)
+
+
+# Find paths in the original dataset which are never referenced by
+# any other paths
+def find_roots(closures):
+    roots = []
+    for closure in closures:
+        path = closure["path"]
+        if not any_refer_to(path, closures):
+            roots.append(path)
+
+    return roots
+
+
+class TestFindRoots(unittest.TestCase):
+    def test_find_roots(self):
+        self.assertCountEqual(
+            find_roots(
+                [
+                    {
+                        "path": "/nix/store/foo",
+                        "references": ["/nix/store/foo", "/nix/store/bar"],
+                    },
+                    {
+                        "path": "/nix/store/bar",
+                        "references": ["/nix/store/bar", "/nix/store/tux"],
+                    },
+                    {"path": "/nix/store/hello", "references": []},
+                ]
+            ),
+            ["/nix/store/foo", "/nix/store/hello"],
+        )
+
+
+def any_refer_to(path, closures):
+    for closure in closures:
+        if path != closure["path"]:
+            if path in closure["references"]:
+                return True
+    return False
+
+
+class TestAnyReferTo(unittest.TestCase):
+    def test_has_references(self):
+        self.assertTrue(
+            any_refer_to(
+                "/nix/store/bar",
+                [
+                    {"path": "/nix/store/foo", "references": ["/nix/store/bar"]},
+                ],
+            ),
+        )
+
+    def test_no_references(self):
+        self.assertFalse(
+            any_refer_to(
+                "/nix/store/foo",
+                [
+                    {
+                        "path": "/nix/store/foo",
+                        "references": ["/nix/store/foo", "/nix/store/bar"],
+                    },
+                ],
+            ),
+        )
+
+
+def all_paths(closures):
+    paths = []
+    for closure in closures:
+        paths.append(closure["path"])
+        paths.extend(closure["references"])
+    paths.sort()
+    return list(set(paths))
+
+
+class TestAllPaths(unittest.TestCase):
+    def test_returns_all_paths(self):
+        self.assertCountEqual(
+            all_paths(
+                [
+                    {
+                        "path": "/nix/store/foo",
+                        "references": ["/nix/store/foo", "/nix/store/bar"],
+                    },
+                    {
+                        "path": "/nix/store/bar",
+                        "references": ["/nix/store/bar", "/nix/store/tux"],
+                    },
+                    {"path": "/nix/store/hello", "references": []},
+                ]
+            ),
+            [
+                "/nix/store/foo",
+                "/nix/store/bar",
+                "/nix/store/hello",
+                "/nix/store/tux",
+            ],
+        )
+
+    def test_no_references(self):
+        self.assertFalse(
+            any_refer_to(
+                "/nix/store/foo",
+                [
+                    {
+                        "path": "/nix/store/foo",
+                        "references": ["/nix/store/foo", "/nix/store/bar"],
+                    },
+                ],
+            ),
+        )
+
+
+# Convert:
+#
+# [
+#    { path: /nix/store/foo, references: [ /nix/store/foo, /nix/store/bar, /nix/store/baz ] },
+#    { path: /nix/store/bar, references: [ /nix/store/bar, /nix/store/baz ] },
+#    { path: /nix/store/baz, references: [ /nix/store/baz, /nix/store/tux ] },
+#    { path: /nix/store/tux, references: [ /nix/store/tux ] }
+# ]
+#
+# To:
+# {
+#   /nix/store/foo: [ /nix/store/bar, /nix/store/baz ],
+#   /nix/store/bar: [ /nix/store/baz ],
+#   /nix/store/baz: [ /nix/store/tux ] },
+#   /nix/store/tux: [ ]
+# }
+#
+# Note that it drops self-references to avoid loops.
+def make_lookup(closures):
+    lookup = {}
+
+    for closure in closures:
+        # paths often self-refer
+        nonreferential_paths = [
+            ref for ref in closure["references"] if ref != closure["path"]
+        ]
+        lookup[closure["path"]] = nonreferential_paths
+
+    return lookup
+
+
+class TestMakeLookup(unittest.TestCase):
+    def test_returns_lookp(self):
+        self.assertDictEqual(
+            make_lookup(
+                [
+                    {
+                        "path": "/nix/store/foo",
+                        "references": ["/nix/store/foo", "/nix/store/bar"],
+                    },
+                    {
+                        "path": "/nix/store/bar",
+                        "references": ["/nix/store/bar", "/nix/store/tux"],
+                    },
+                    {"path": "/nix/store/hello", "references": []},
+                ]
+            ),
+            {
+                "/nix/store/foo": ["/nix/store/bar"],
+                "/nix/store/bar": ["/nix/store/tux"],
+                "/nix/store/hello": [],
+            },
+        )
+
+
+# Convert:
+#
+# /nix/store/foo with
+# {
+#   /nix/store/foo: [ /nix/store/bar, /nix/store/baz ],
+#   /nix/store/bar: [ /nix/store/baz ],
+#   /nix/store/baz: [ /nix/store/tux ] },
+#   /nix/store/tux: [ ]
+# }
+#
+# To:
+#
+# {
+#   /nix/store/bar: {
+#     /nix/store/baz: {
+#       /nix/store/tux: {}
+#     }
+#   },
+#   /nix/store/baz: {
+#     /nix/store/tux: {}
+#   }
+# }
+subgraphs_cache = {}
+
+
+def make_graph_segment_from_root(root, lookup):
+    global subgraphs_cache
+    children = {}
+    for ref in lookup[root]:
+        # make_graph_segment_from_root is a pure function, and will
+        # always return the same result based on a given input. Thus,
+        # cache computation.
+        #
+        # Python's assignment will use a pointer, preventing memory
+        # bloat for large graphs.
+        if ref not in subgraphs_cache:
+            debug("Subgraph Cache miss on {}".format(ref))
+            subgraphs_cache[ref] = make_graph_segment_from_root(ref, lookup)
+        else:
+            debug("Subgraph Cache hit on {}".format(ref))
+        children[ref] = subgraphs_cache[ref]
+    return children
+
+
+class TestMakeGraphSegmentFromRoot(unittest.TestCase):
+    def test_returns_graph(self):
+        self.assertDictEqual(
+            make_graph_segment_from_root(
+                "/nix/store/foo",
+                {
+                    "/nix/store/foo": ["/nix/store/bar"],
+                    "/nix/store/bar": ["/nix/store/tux"],
+                    "/nix/store/tux": [],
+                    "/nix/store/hello": [],
+                },
+            ),
+            {"/nix/store/bar": {"/nix/store/tux": {}}},
+        )
+
+    def test_returns_graph_tiny(self):
+        self.assertDictEqual(
+            make_graph_segment_from_root(
+                "/nix/store/tux",
+                {
+                    "/nix/store/foo": ["/nix/store/bar"],
+                    "/nix/store/bar": ["/nix/store/tux"],
+                    "/nix/store/tux": [],
+                },
+            ),
+            {},
+        )
+
+
+# Convert a graph segment in to a popularity-counted dictionary:
+#
+# From:
+# {
+#   /nix/store/foo: {
+#     /nix/store/bar: {
+#       /nix/store/baz: {
+#         /nix/store/tux: {}
+#       }
+#     }
+#     /nix/store/baz: {
+#       /nix/store/tux: {}
+#     }
+#   }
+# }
+#
+# to:
+# [
+#   /nix/store/foo: 1
+#   /nix/store/bar: 2
+#   /nix/store/baz: 4
+#   /nix/store/tux: 6
+# ]
+popularity_cache = {}
+
+
+def graph_popularity_contest(full_graph):
+    global popularity_cache
+    popularity = defaultdict(int)
+    for path, subgraph in full_graph.items():
+        popularity[path] += 1
+        # graph_popularity_contest is a pure function, and will
+        # always return the same result based on a given input. Thus,
+        # cache computation.
+        #
+        # Python's assignment will use a pointer, preventing memory
+        # bloat for large graphs.
+        if path not in popularity_cache:
+            debug("Popularity Cache miss on {}", path)
+            popularity_cache[path] = graph_popularity_contest(subgraph)
+        else:
+            debug("Popularity Cache hit on {}", path)
+
+        subcontest = popularity_cache[path]
+        for subpath, subpopularity in subcontest.items():
+            debug("Calculating popularity for {}", subpath)
+            popularity[subpath] += subpopularity + 1
+
+    return popularity
+
+
+class TestGraphPopularityContest(unittest.TestCase):
+    def test_counts_popularity(self):
+        self.assertDictEqual(
+            graph_popularity_contest(
+                {
+                    "/nix/store/foo": {
+                        "/nix/store/bar": {"/nix/store/baz": {"/nix/store/tux": {}}},
+                        "/nix/store/baz": {"/nix/store/tux": {}},
+                    }
+                }
+            ),
+            {
+                "/nix/store/foo": 1,
+                "/nix/store/bar": 2,
+                "/nix/store/baz": 4,
+                "/nix/store/tux": 6,
+            },
+        )
+
+
+# Emit a list of packages by popularity, most first:
+#
+# From:
+# [
+#   /nix/store/foo: 1
+#   /nix/store/bar: 1
+#   /nix/store/baz: 2
+#   /nix/store/tux: 2
+# ]
+#
+# To:
+# [ /nix/store/baz /nix/store/tux /nix/store/bar /nix/store/foo ]
+def order_by_popularity(paths):
+    paths_by_popularity = defaultdict(list)
+    popularities = []
+    for path, popularity in paths.items():
+        popularities.append(popularity)
+        paths_by_popularity[popularity].append(path)
+
+    popularities = list(set(popularities))
+    popularities.sort()
+
+    flat_ordered = []
+    for popularity in popularities:
+        paths = paths_by_popularity[popularity]
+        paths.sort(key=package_name)
+
+        flat_ordered.extend(reversed(paths))
+    return list(reversed(flat_ordered))
+
+
+class TestOrderByPopularity(unittest.TestCase):
+    def test_returns_in_order(self):
+        self.assertEqual(
+            order_by_popularity(
+                {
+                    "/nix/store/foo": 1,
+                    "/nix/store/bar": 1,
+                    "/nix/store/baz": 2,
+                    "/nix/store/tux": 2,
+                }
+            ),
+            ["/nix/store/baz", "/nix/store/tux", "/nix/store/bar", "/nix/store/foo"],
+        )
+
+
+def package_name(path):
+    parts = path.split("-")
+    start = parts.pop(0)
+    # don't throw away any data, so the order is always the same.
+    # even in cases where only the hash at the start has changed.
+    parts.append(start)
+    return "-".join(parts)
+
+
+def main():
+    filename = sys.argv[1]
+    key = sys.argv[2]
+
+    debug("Loading from {}", filename)
+    with open(filename) as f:
+        data = json.load(f)
+
+    # Data comes in as:
+    # [
+    #    { path: /nix/store/foo, references: [ /nix/store/foo, /nix/store/bar, /nix/store/baz ] },
+    #    { path: /nix/store/bar, references: [ /nix/store/bar, /nix/store/baz ] },
+    #    { path: /nix/store/baz, references: [ /nix/store/baz, /nix/store/tux ] },
+    #    { path: /nix/store/tux, references: [ /nix/store/tux ] }
+    # ]
+    #
+    # and we want to get out a list of paths ordered by how universally,
+    # important they are, ie: tux is referenced by every path, transitively
+    # so it should be #1
+    #
+    # [
+    #   /nix/store/tux,
+    #   /nix/store/baz,
+    #   /nix/store/bar,
+    #   /nix/store/foo,
+    # ]
+    graph = data[key]
+
+    debug("Finding roots from {}", key)
+    roots = find_roots(graph)
+    debug("Making lookup for {}", key)
+    lookup = make_lookup(graph)
+
+    full_graph = {}
+    for root in roots:
+        debug("Making full graph for {}", root)
+        full_graph[root] = make_graph_segment_from_root(root, lookup)
+
+    debug("Running contest")
+    contest = graph_popularity_contest(full_graph)
+    debug("Ordering by popularity")
+    ordered = order_by_popularity(contest)
+    debug("Checking for missing paths")
+    missing = []
+    for path in all_paths(graph):
+        if path not in ordered:
+            missing.append(path)
+
+    ordered.extend(missing)
+    print("\n".join(ordered))
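Since `main()` itself survives in the vendored file (only its invocation was removed), the same pipeline can also be driven programmatically. A self-contained sketch with a three-path toy closure, using invented store paths:

```python
from nix_closure_graph import (
    find_roots,
    make_lookup,
    make_graph_segment_from_root,
    graph_popularity_contest,
    order_by_popularity,
)

# A tiny closure in the format main() expects: every path lists itself
# plus everything it references.
closures = [
    {"path": "/nix/store/foo", "references": ["/nix/store/foo", "/nix/store/bar"]},
    {"path": "/nix/store/bar", "references": ["/nix/store/bar", "/nix/store/tux"]},
    {"path": "/nix/store/tux", "references": ["/nix/store/tux"]},
]

lookup = make_lookup(closures)
full_graph = {
    root: make_graph_segment_from_root(root, lookup) for root in find_roots(closures)
}
print(order_by_popularity(graph_popularity_contest(full_graph)))
# ['/nix/store/tux', '/nix/store/bar', '/nix/store/foo']
```

This mirrors what `layers.py` does, except that rpmoci builds its graph over every installed package rather than only the roots, then assigns the files of the `max_layers` most popular packages to their own layers.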