Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WORK-IN-PROGRESS] Introduce the path walk API into Git for Windows #5146

Closed
wants to merge 45 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
6354d7a
path-walk: introduce an object walk by path
derrickstolee Aug 29, 2024
c8e08c3
backfill: add builtin boilerplate
derrickstolee Jun 7, 2024
b05a276
backfill: basic functionality and tests
derrickstolee Sep 1, 2024
e02f7b3
backfill: add --batch-size=<n> option
derrickstolee Sep 1, 2024
4236e4f
backfill: add --sparse option
derrickstolee Sep 1, 2024
31c9b45
path-walk: allow consumer to specify object types
derrickstolee Sep 1, 2024
356abc9
backfill: assume --sparse when sparse-checkout is enabled
derrickstolee Sep 1, 2024
3a421ff
path-walk: allow visiting tags
derrickstolee Sep 9, 2024
b9471b6
survey: stub in new experimental `git-survey` command
jeffhostetler Apr 29, 2024
c4b3490
survey: add command line opts to select references
jeffhostetler Apr 29, 2024
ca37a49
survey: collect the set of requested refs
jeffhostetler Apr 29, 2024
0a20a17
survey: start pretty printing data in table form
derrickstolee Sep 1, 2024
91c4d57
survey: add object count summary
derrickstolee Sep 2, 2024
53632be
revision: create mark_trees_uninteresting_dense()
derrickstolee Sep 6, 2024
c63928e
survey: summarize total sizes by object type
derrickstolee Sep 2, 2024
3e9b671
path-walk: add prune_all_uninteresting option
derrickstolee Sep 4, 2024
af7d53f
survey: show progress during object walk
derrickstolee Sep 2, 2024
d192ae7
pack-objects: add --path-walk option
derrickstolee Sep 5, 2024
5f7e131
survey: add ability to track prioritized lists
derrickstolee Sep 2, 2024
ab0bc08
pack-objects: extract should_attempt_deltas()
derrickstolee Sep 6, 2024
bd8b5b5
survey: add report of "largest" paths
derrickstolee Sep 2, 2024
c6d4832
pack-objects: introduce GIT_TEST_PACK_PATH_WALK
derrickstolee Sep 6, 2024
c2092f0
p5313: add size comparison test
derrickstolee Aug 28, 2024
bbc57f7
repack: add --path-walk option
derrickstolee Sep 5, 2024
32fca07
pack-objects: enable --path-walk via config
derrickstolee Sep 5, 2024
c145b9e
pack-objects: add --full-name-hash option
derrickstolee Sep 7, 2024
72191a0
test-name-hash: add helper to compute name-hash functions
derrickstolee Sep 8, 2024
5039f03
p5314: add a size test for name-hash collisions
derrickstolee Sep 9, 2024
e43582c
scalar: enable path-walk during push via config
derrickstolee Sep 5, 2024
88fee5b
pack-objects: output debug info about deltas
derrickstolee Aug 28, 2024
d17e503
Merge branch 'backfill'
dscho Sep 15, 2024
d7e7283
Merge branch 'survey'
dscho Sep 15, 2024
98a5786
Merge branch 'pack-path-walk'
dscho Sep 15, 2024
9d0690a
Merge branch 'path-walk'
dscho Sep 15, 2024
556335a
fixup! survey: collect the set of requested refs
dscho Sep 15, 2024
69aa8d8
fixup! pack-objects: output debug info about deltas
dscho Sep 15, 2024
5001883
fixup! survey: summarize total sizes by object type
dscho Sep 15, 2024
3ab1bda
fixup! survey: add report of "largest" paths
dscho Sep 15, 2024
84c8a06
fixup! survey: summarize total sizes by object type
dscho Sep 15, 2024
16cd9a3
fixup! pack-objects: output debug info about deltas
dscho Sep 15, 2024
c8f1239
fixup! survey: start pretty printing data in table form
dscho Sep 15, 2024
b5c2265
fixup! survey: add object count summary
dscho Sep 15, 2024
fee8f88
fixup! survey: summarize total sizes by object type
dscho Sep 15, 2024
489ce0c
test-tool: add the `path-walk` subcommand
dscho Sep 17, 2024
9b78d40
fixup! test-tool: add the `path-walk` subcommand
dscho Sep 17, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -1097,6 +1097,7 @@ LIB_OBJS += parse-options.o
LIB_OBJS += patch-delta.o
LIB_OBJS += patch-ids.o
LIB_OBJS += path.o
LIB_OBJS += path-walk.o
LIB_OBJS += pathspec.o
LIB_OBJS += pkt-line.o
LIB_OBJS += preload-index.o
Expand Down
235 changes: 235 additions & 0 deletions path-walk.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,235 @@
/*
* path-walk.c: implementation for path-based walks of the object graph.
*/
#include "git-compat-util.h"
#include "path-walk.h"
#include "blob.h"
#include "commit.h"
#include "dir.h"
#include "hashmap.h"
#include "hex.h"
#include "object.h"
#include "oid-array.h"
#include "revision.h"
#include "string-list.h"
#include "strmap.h"
#include "trace2.h"
#include "tree.h"
#include "tree-walk.h"

struct type_and_oid_list
{
enum object_type type;
struct oid_array oids;
};

#define TYPE_AND_OID_LIST_INIT { \
.type = OBJ_NONE, \
.oids = OID_ARRAY_INIT \
}

struct path_walk_context {
/**
* Repeats of data in 'struct path_walk_info' for
* access with fewer characters.
*/
struct repository *repo;
struct rev_info *revs;
struct path_walk_info *info;

/**
* Map a path to a 'struct type_and_oid_list'
* containing the objects discovered at that
* path.
*/
struct strmap paths_to_lists;
dscho marked this conversation as resolved.
Show resolved Hide resolved

/**
* Store the current list of paths in a stack, to
* facilitate depth-first-search without recursion.
*/
struct string_list path_stack;
};

static int add_children(struct path_walk_context *ctx,
const char *base_path,
struct object_id *oid)
{
struct tree_desc desc;
struct name_entry entry;
struct strbuf path = STRBUF_INIT;
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For performance reasons, it might make sense to pass in base_path as a strbuf, so that only one scratch buffer is used for the entirety of the path walk (at least in single-threaded mode). But we can do that later.

size_t base_len;
struct tree *tree = lookup_tree(ctx->repo, oid);

if (!tree) {
error(_("failed to walk children of tree %s: not found"),
oid_to_hex(oid));
return -1;
}

strbuf_addstr(&path, base_path);
base_len = path.len;

parse_tree(tree);
init_tree_desc(&desc, &tree->object.oid, tree->buffer, tree->size);
while (tree_entry(&desc, &entry)) {
struct type_and_oid_list *list;
struct object *o;
/* Not actually true, but we will ignore submodules later. */
dscho marked this conversation as resolved.
Show resolved Hide resolved
enum object_type type = S_ISDIR(entry.mode) ? OBJ_TREE : OBJ_BLOB;

/* Skip submodules. */
if (S_ISGITLINK(entry.mode))
continue;

if (type == OBJ_TREE) {
struct tree *child = lookup_tree(ctx->repo, &entry.oid);
o = child ? &child->object : NULL;
} else if (type == OBJ_BLOB) {
struct blob *child = lookup_blob(ctx->repo, &entry.oid);
o = child ? &child->object : NULL;
} else {
/* Wrong type? */
continue;
}

if (!o) /* report error?*/
continue;

/* Skip this object if already seen. */
if (o->flags & SEEN)
continue;
o->flags |= SEEN;

strbuf_setlen(&path, base_len);
strbuf_add(&path, entry.path, entry.pathlen);

/*
* Trees will end with "/" for concatenation and distinction
* from blobs at the same path.
*/
if (type == OBJ_TREE)
strbuf_addch(&path, '/');

if (!(list = strmap_get(&ctx->paths_to_lists, path.buf))) {
CALLOC_ARRAY(list, 1);
list->type = type;
strmap_put(&ctx->paths_to_lists, path.buf, list);
string_list_append(&ctx->path_stack, path.buf);
}
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We might want to add this, out of an abundance of caution:

else if (list->type != type)
    warning("both blob and tree objects found at '%s'", path.buf);

oid_array_append(&list->oids, &entry.oid);
}

free_tree_buffer(tree);
strbuf_release(&path);
return 0;
}

/*
* For each path in paths_to_explore, walk the trees another level
* and add any found blobs to the batch (but only if they don't
* exist and haven't been added yet).
*/
static int walk_path(struct path_walk_context *ctx,
const char *path)
{
struct type_and_oid_list *list;
int ret = 0;

list = strmap_get(&ctx->paths_to_lists, path);

/* Evaluate function pointer on this data. */
ret = ctx->info->path_fn(path, &list->oids, list->type,
ctx->info->path_fn_data);

/* Expand data for children. */
if (list->type == OBJ_TREE) {
for (size_t i = 0; i < list->oids.nr; i++) {
ret |= add_children(ctx,
Comment on lines +210 to +211
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Continuing on errors may be the right thing to do always. On the off-chance that a user may want to return early on error, we may want to add a flag for that.

path,
&list->oids.oid[i]);
}
}

oid_array_clear(&list->oids);
strmap_remove(&ctx->paths_to_lists, path, 1);
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder whether this is safe. If the same path appears multiple times on the stack, e.g. if two merged branches separately removed the same file, wouldn't we then remove the paths_to_lists item prematurely? This might be intentional, though, as we stop once we're hitting a blob that we'd already seen. Therefore in a history like this:

... - A - B - C
        \   /
          D

where file.txt was present in A, deleted in B and D, and missing from C, we would probably hit the file in B first, walk its history all the way until the commit that added it, then remove the item from paths_to_lists, then re-add it when encountering it in D, see that we handled it in A already, and re-remove it. That might be what we need here, but I have not been able to wrap my head around this enough yet to be confident.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is important to realize that a path appears at most once. The walk goes like this:

  1. Walk all commits that we are going to walk. Initialize the "stack" to include only the root path ("")
  2. Walk all root trees that we are going to walk, initializing the "stack" with the list of all paths that appear as entries of those trees.
  3. Continue walking the stack, knowing that we visit each path only once.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, the crucial part I was missing is that we first walk all commits, build up the huge, giant list for "" (i.e. all the top-level trees). Then we iterate over all of those before going any further. Crucially, no single tree walk happens until every last of the commits has been walked.

For some reason, I got the impression that we're doing a depth-first search where we immediately iterate into the full tree of the first commit, until we exhausted all of the blobs reachable from that root tree, and only then walk to the next commit. But that's not what we do ;-)

We basically walk all the paths in a depth-first manner, but not in alphabetical order because we end up plucking off the paths from the top of the stack (i.e. the "alphabetically last" item seen so far, but that only holds true if the root tree has no child trees, otherwise the order gets messy real quickly). So the order is somewhat jumbled, I guess ;-)

Thank you for explaining this so patiently. I think my confusion might be a good motivation to add a little primer about the order to the commit message of the "path-walk: introduce an object walk by path" commit.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was working on a technical doc for the API but don't have it committed or pushed anywhere. This confirms that it is necessary.

return ret;
}

static void clear_strmap(struct strmap *map)
{
struct hashmap_iter iter;
struct strmap_entry *e;

hashmap_for_each_entry(&map->map, &iter, e, ent) {
struct type_and_oid_list *list = e->value;
oid_array_clear(&list->oids);
}
strmap_clear(map, 1);
strmap_init(map);
}

/**
* Given the configuration of 'info', walk the commits based on 'info->revs' and
* call 'info->path_fn' on each discovered path.
*
* Returns nonzero on an error.
*/
int walk_objects_by_path(struct path_walk_info *info)
{
const char *root_path = "";
int ret = 0;
size_t commits_nr = 0, paths_nr = 0;
struct commit *c;
struct type_and_oid_list *root_tree_list;
struct path_walk_context ctx = {
.repo = info->revs->repo,
.revs = info->revs,
.info = info,
.path_stack = STRING_LIST_INIT_DUP,
.paths_to_lists = STRMAP_INIT
};

trace2_region_enter("path-walk", "commit-walk", info->revs->repo);

/* Insert a single list for the root tree into the paths. */
CALLOC_ARRAY(root_tree_list, 1);
root_tree_list->type = OBJ_TREE;
strmap_put(&ctx.paths_to_lists, root_path, root_tree_list);

if (prepare_revision_walk(info->revs))
die(_("failed to setup revision walk"));

while ((c = get_revision(info->revs))) {
struct object_id *oid = get_commit_tree_oid(c);
struct tree *t = lookup_tree(info->revs->repo, oid);
commits_nr++;

if (t)
oid_array_append(&root_tree_list->oids, oid);
else
warning("could not find tree %s", oid_to_hex(oid));
}

trace2_data_intmax("path-walk", ctx.repo, "commits", commits_nr);
trace2_region_leave("path-walk", "commit-walk", info->revs->repo);

string_list_append(&ctx.path_stack, root_path);

trace2_region_enter("path-walk", "path-walk", info->revs->repo);
while (!ret && ctx.path_stack.nr) {
char *path = ctx.path_stack.items[ctx.path_stack.nr - 1].string;
ctx.path_stack.nr--;
paths_nr++;

ret = walk_path(&ctx, path);

free(path);
}
trace2_data_intmax("path-walk", ctx.repo, "paths", paths_nr);
trace2_region_leave("path-walk", "path-walk", info->revs->repo);

clear_strmap(&ctx.paths_to_lists);
string_list_clear(&ctx.path_stack, 0);
return ret;
}
43 changes: 43 additions & 0 deletions path-walk.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
/*
* path-walk.h : Methods and structures for walking the object graph in batches
* by the paths that can reach those objects.
*/
#include "object.h" /* Required for 'enum object_type'. */

struct rev_info;
struct oid_array;

/**
* The type of a function pointer for the method that is called on a list of
* objects reachable at a given path.
*/
typedef int (*path_fn)(const char *path,
struct oid_array *oids,
enum object_type type,
void *data);

struct path_walk_info {
/**
* revs provides the definitions for the commit walk, including
* which commits are UNINTERESTING or not.
*/
struct rev_info *revs;

/**
* The caller wishes to execute custom logic on objects reachable at a
* given path. Every reachable object will be visited exactly once, and
* the first path to see an object wins. This may not be a stable choice.
*/
path_fn path_fn;
void *path_fn_data;
};

#define PATH_WALK_INFO_INIT { 0 }

/**
* Given the configuration of 'info', walk the commits based on 'info->revs' and
* call 'info->path_fn' on each discovered path.
*
* Returns nonzero on an error.
*/
int walk_objects_by_path(struct path_walk_info *info);