diff --git a/src/editdist.jl b/src/editdist.jl index a1677fa7f..895ff7a3e 100644 --- a/src/editdist.jl +++ b/src/editdist.jl @@ -12,9 +12,16 @@ representing vertex operations: ### Optional Arguments -- `insert_cost::Function=v->1.0` -- `delete_cost::Function=u->1.0` -- `subst_cost::Function=(u,v)->0.5` +- `vertex_insert_cost::Function=v->0.` +- `vertex_delete_cost::Function=u->0.` +- `vertex_subst_cost::Function=(u, v)->0.` +- `edge_insert_cost::Function=e->1.` +- `edge_delete_cost::Function=e->1.` +- `edge_subst_cost::Function=(e1, e2)->0.` + +The algorithm will always try to match two edges if it can, so if it is +preferrable to delete two edges rather than match these, it should be +reflected in the `edge_subst_cost` function. By default, the algorithm uses constant operation costs. The user can provide classical Minkowski costs computed from vertex @@ -31,7 +38,7 @@ search in case the default heuristic is not satisfactory. - Given two graphs ``|G₁| < |G₂|``, `edit_distance(G₁, G₂)` is faster to compute than `edit_distance(G₂, G₁)`. Consider swapping the arguments if involved costs are equivalent. -- The use of simple Minkowski costs can improve performance considerably. +- The use of a heuristic can improve performance considerably. - Exploit vertex attributes when designing operation costs. ### References @@ -49,51 +56,163 @@ julia> g1 = SimpleDiGraph([0 1 0 0 0; 0 0 1 0 0; 1 0 0 1 0; 0 0 0 0 1; 0 0 0 1 0 julia> g2 = SimpleDiGraph([0 1 0; 0 0 1; 1 0 0]); julia> edit_distance(g1, g2) -(3.5, Tuple[(1, 2), (2, 1), (3, 0), (4, 3), (5, 0)]) +(3.0, Tuple[(1, 3), (2, 1), (3, 2), (4, 0), (5, 0)]) ``` """ function edit_distance( G₁::AbstractGraph, G₂::AbstractGraph; - insert_cost::Function=v -> 1.0, - delete_cost::Function=u -> 1.0, - subst_cost::Function=(u, v) -> 0.5, - heuristic::Function=DefaultEditHeuristic, + vertex_insert_cost=nothing, + vertex_delete_cost=nothing, + vertex_subst_cost=nothing, + edge_insert_cost=nothing, + edge_delete_cost=nothing, + edge_subst_cost=nothing, + heuristic=nothing, ) + if isnothing(vertex_insert_cost) && + isnothing(vertex_delete_cost) && + isnothing(vertex_subst_cost) && + isnothing(edge_insert_cost) && + isnothing(edge_delete_cost) && + isnothing(edge_subst_cost) && + isnothing(heuristic) + heuristic = default_edit_heuristic + end + vertex_insert_cost = something(vertex_insert_cost, v -> 0.0) + vertex_delete_cost = something(vertex_delete_cost, v -> 0.0) + vertex_subst_cost = something(vertex_subst_cost, (u, v) -> 0.0) + edge_insert_cost = something(edge_insert_cost, e -> 1.0) + edge_delete_cost = something(edge_delete_cost, e -> 1.0) + edge_subst_cost = something(edge_subst_cost, (e1, e2) -> 0.0) + heuristic = something(heuristic, (λ, G₁, G₂) -> 0.0) + return _edit_distance( + G₁::AbstractGraph, + G₂::AbstractGraph, + vertex_insert_cost, + vertex_delete_cost, + vertex_subst_cost, + edge_insert_cost, + edge_delete_cost, + edge_subst_cost, + heuristic, + ) +end + +function _edit_distance( + G₁::AbstractGraph{T}, + G₂::AbstractGraph{U}, + vertex_insert_cost::Function, + vertex_delete_cost::Function, + vertex_subst_cost::Function, + edge_insert_cost::Function, + edge_delete_cost::Function, + edge_subst_cost::Function, + heuristic::Function, +) where {T<:Integer,U<:Integer} + isdirected = is_directed(G₁) || is_directed(G₂) + + # compute the cost on edges due to associate u1 to v1 and u2 to v2 + # u2 and v2 can eventually be 0 + function association_cost(u1, u2, v1, v2) + cost = 0.0 + if has_edge(G₁, u1, u2) + if has_edge(G₂, v1, v2) + cost += edge_subst_cost(Edge(u1, u2), Edge(v1, v2)) + else + cost += edge_delete_cost(Edge(u1, u2)) + end + else + if has_edge(G₂, v1, v2) + cost += edge_insert_cost(Edge(v1, v2)) + end + end + if isdirected && u1 != u2 + if has_edge(G₁, u2, u1) + if has_edge(G₂, v2, v1) + cost += edge_subst_cost(Edge(u2, u1), Edge(v2, v1)) + else + cost += edge_delete_cost(Edge(u2, u1)) + end + else + if has_edge(G₂, v2, v1) + cost += edge_insert_cost(Edge(v2, v1)) + end + end + end + return cost + end # A* search heuristic h(λ) = heuristic(λ, G₁, G₂) # initialize open set OPEN = PriorityQueue{Vector{Tuple},Float64}() - for v in 1:nv(G₂) - enqueue!(OPEN, [(1, v)], subst_cost(1, v) + h([(1, v)])) + for v in vertices(G₂) + enqueue!(OPEN, [(T(1), v)], vertex_subst_cost(1, v) + h([(T(1), v)])) end - enqueue!(OPEN, [(1, 0)], delete_cost(1) + h([(1, 0)])) + enqueue!(OPEN, [(T(1), U(0))], vertex_delete_cost(1) + h([(T(1), U(0))])) + c = 0 while true # minimum (partial) edit path λ, cost = peek(OPEN) + c += 1 dequeue!(OPEN) if is_complete_path(λ, G₁, G₂) return cost, λ else - k, _ = λ[end] - vs = setdiff(1:nv(G₂), [v for (u, v) in λ]) + u1, _ = λ[end] + u1 += T(1) + vs = setdiff(vertices(G₂), [v for (u, v) in λ]) - if k < nv(G₁) # there are still vertices to process in G₁? - for v in vs - λ⁺ = [λ; (k + 1, v)] - enqueue!(OPEN, λ⁺, cost + subst_cost(k + 1, v) + h(λ⁺) - h(λ)) + if u1 <= nv(G₁) # there are still vertices to process in G₁? + # we try every possible assignment of v1 + for v1 in vs + λ⁺ = [λ; (u1, v1)] + new_cost = cost + vertex_subst_cost(u1, v1) + h(λ⁺) - h(λ) + for (u2, v2) in λ + new_cost += association_cost(u1, u2, v1, v2) + end + new_cost += association_cost(u1, u1, v1, v1) # handle self-loops + + enqueue!(OPEN, λ⁺, new_cost) + end + # we try deleting v1 + λ⁺ = [λ; (u1, U(0))] + new_cost = cost + vertex_delete_cost(u1) + h(λ⁺) - h(λ) + for u2 in outneighbors(G₁, u1) + # edges deleted later when assigning v2 + u2 > u1 && continue + new_cost += edge_delete_cost(Edge(u1, u2)) end - λ⁺ = [λ; (k + 1, 0)] - enqueue!(OPEN, λ⁺, cost + delete_cost(k + 1) + h(λ⁺) - h(λ)) + if isdirected + for u2 in inneighbors(G₁, u1) + # edges deleted later when assigning v2, and we should not count a self loop twice + u2 >= u1 && continue + new_cost += edge_delete_cost(Edge(u2, u1)) + end + end + enqueue!(OPEN, λ⁺, new_cost) else - # add remaining vertices of G₂ to the path - λ⁺ = [λ; [(0, v) for v in vs]] - total_insert_cost = sum(insert_cost, vs) - enqueue!(OPEN, λ⁺, cost + total_insert_cost + h(λ⁺) - h(λ)) + # add remaining vertices of G₂ to the path by deleting them + λ⁺ = [λ; [(T(0), v) for v in vs]] + new_cost = cost + sum(vertex_insert_cost, vs) + for v1 in vs + for v2 in outneighbors(G₂, v1) + (v2 > v1 && v2 in vs) && continue # these edges will be deleted later + new_cost += edge_insert_cost(Edge(v1, v2)) + end + if isdirected + for v2 in inneighbors(G₂, v1) + (v2 > v1 && v2 in vs) && continue # these edges will be deleted later + v1 == v2 && continue # we should not count a self loop twice + new_cost += edge_insert_cost(Edge(v2, v1)) + end + end + end + enqueue!(OPEN, λ⁺, new_cost + h(λ⁺) - h(λ)) end end end @@ -112,11 +231,40 @@ function is_complete_path(λ, G₁, G₂) return length(us) == nv(G₁) && length(vs) == nv(G₂) end -function DefaultEditHeuristic(λ, G₁::AbstractGraph, G₂::AbstractGraph) - vs = Set([v for (u, v) in λ]) - delete!(vs, 0) +# edit_distance(G₁::AbstractGraph, G₂::AbstractGraph) = +# edit_distance(G₁, G₂, +# vertex_insert_cost=v -> 0., +# vertex_delete_cost=u -> 0., +# vertex_subst_cost=(u, v) -> 0., +# edge_insert_cost=e -> 1., +# edge_delete_cost=e -> 1., +# edge_subst_cost=(e1, e2) -> 0., +# heuristic=default_edit_heuristic) - return nv(G₂) - length(vs) +""" +compute an upper bound on the number of edges that can still be affected +""" +function default_edit_heuristic(λ, G₁::AbstractGraph, G₂::AbstractGraph) + us = setdiff(1:nv(G₁), [u for (u, v) in λ]) + vs = setdiff(1:nv(G₂), [v for (u, v) in λ]) + total_free_edges_g1 = 0 + total_free_edges_g2 = 0 + if !isempty(us) + total_free_edges_g1 = sum(u -> outdegree(G₁, u), us) + end + if !isempty(vs) + total_free_edges_g2 = sum(v -> outdegree(G₂, v), vs) + end + for (u1, v1) in λ + (u1 == 0 || v1 == 0) && continue + total_free_edges_g1 += count(u2 -> u2 in us, outneighbors(G₁, u1)) + total_free_edges_g2 += count(v2 -> v2 in vs, outneighbors(G₂, v1)) + end + if !is_directed(G₁) && !is_directed(G₂) + total_free_edges_g1 = total_free_edges_g1 / 2 + total_free_edges_g2 = total_free_edges_g2 / 2 + end + return abs(total_free_edges_g1 - total_free_edges_g2) end #------------------------- diff --git a/test/edit_distance.jl b/test/edit_distance.jl index 7b5484629..33ca2c2cb 100644 --- a/test/edit_distance.jl +++ b/test/edit_distance.jl @@ -4,51 +4,48 @@ gquad = random_regular_graph(4, 2; rng=rng) gpent = random_regular_graph(5, 2; rng=rng) - @testset "edit_distance $triangle, $quadrangle, $pentagon" for triangle in - testgraphs(gtri), - quadrangle in testgraphs(gquad), - pentagon in testgraphs(gpent) + g1 = star_graph(4) + g2 = cycle_graph(3) - d, λ = @inferred( - edit_distance(triangle, quadrangle, subst_cost=MinkowskiCost(1:3, 1:4)) - ) - @test d == 1.0 - @test λ == Tuple[(1, 1), (2, 2), (3, 3), (0, 4)] - - d, λ = @inferred( - edit_distance(quadrangle, triangle, subst_cost=MinkowskiCost(1:4, 1:3)) - ) - @test d == 1.0 - @test λ == Tuple[(1, 1), (2, 2), (3, 3), (4, 0)] + vertex_insert_cost = v -> 1.0 + vertex_delete_cost = v -> 2.0 + vertex_subst_cost = (u, v) -> 3.0 + edge_insert_cost = e -> 4.0 + edge_delete_cost = e -> 5.0 + edge_subst_cost = (e1, e2) -> 6.0 - d, λ = @inferred( - edit_distance(triangle, pentagon, subst_cost=MinkowskiCost(1:3, 1:5)) - ) + @testset "undirected edit_distance" for G1 in testgraphs(g1), G2 in testgraphs(g2) + d, λ = @inferred(edit_distance(G1, G2)) @test d == 2.0 - @test λ == Tuple[(1, 1), (2, 2), (3, 3), (0, 4), (0, 5)] - d, λ = @inferred( - edit_distance(pentagon, triangle, subst_cost=MinkowskiCost(1:5, 1:3)) + edit_distance( + G1, + G2, + vertex_insert_cost=vertex_insert_cost, + vertex_delete_cost=vertex_delete_cost, + vertex_subst_cost=vertex_subst_cost, + edge_insert_cost=edge_insert_cost, + edge_delete_cost=edge_delete_cost, + edge_subst_cost=edge_subst_cost, + ) ) - @test d == 2.0 - @test λ == Tuple[(1, 1), (2, 2), (3, 3), (4, 0), (5, 0)] + # 1 vertex deletion, 3 vertex substitution, 1 edge insertio n, 1 edge deletion, 2 edge substitution + @test d == 32.0 end - @testset "Minkowski cost / bounded Minkowski" begin - cost = @inferred(MinkowskiCost(1:3, 1:3)) - bcost = @inferred(BoundedMinkowskiCost(1:3, 1:3)) - for i in 1:3 - @test cost(i, i) == 0.0 - @test bcost(i, i) == 2 / 3 - end + g1 = DiGraph(4) + edges = [(1, 2), (1, 4), (2, 3), (3, 1), (3, 4), (4, 1), (1, 1), (4, 4)] + for e in edges + add_edge!(g1, e) + end + g2 = DiGraph(4) + edges = [(2, 1), (2, 3), (3, 1), (3, 2), (4, 1), (4, 2), (2, 2), (3, 3)] + for e in edges + add_edge!(g2, e) end - g1c = complete_graph(4) - g2c = complete_graph(4) - rem_edge!(g2c, 1, 2) - @testset "edit_distance $g1, $g2" for g1 in testgraphs(g1c), g2 in testgraphs(g2c) - d, λ = @inferred(edit_distance(g1, g2)) - @test d == 2.0 - @test λ == Tuple[(1, 1), (2, 2), (3, 3), (4, 4)] + @testset "directed edit_distance" for G1 in testgraphs(g1), G2 in testgraphs(g2) + d, λ = @inferred(edit_distance(G1, G2)) + @test d == 4.0 end end