Rewrite of edit_distance with edge costs. fix #111 (#137)

* fix edit_distance * some fixes * add tests; little bit of cleaning * make code type stable * use something; initiate cost with a float * Apply formatter * Fix docstring --------- Co-authored-by: Guillaume Dalle <22795598+gdalle@users.noreply.github.com>
JuliaGraphs · Jun 28, 2023 · 327740b · 327740b
1 parent af28a4f
commit 327740b
Show file tree

Hide file tree

Showing 2 changed files with 210 additions and 65 deletions.
diff --git a/src/editdist.jl b/src/editdist.jl
@@ -12,9 +12,16 @@ representing vertex operations:
 
 
 ### Optional Arguments
-- `insert_cost::Function=v->1.0`
-- `delete_cost::Function=u->1.0`
-- `subst_cost::Function=(u,v)->0.5`
+- `vertex_insert_cost::Function=v->0.`
+- `vertex_delete_cost::Function=u->0.`
+- `vertex_subst_cost::Function=(u, v)->0.`
+- `edge_insert_cost::Function=e->1.`
+- `edge_delete_cost::Function=e->1.`
+- `edge_subst_cost::Function=(e1, e2)->0.`
+
+The algorithm will always try to match two edges if it can, so if it is
+preferrable to delete two edges rather than match these, it should be
+reflected in the `edge_subst_cost` function.
 
 By default, the algorithm uses constant operation costs. The
 user can provide classical Minkowski costs computed from vertex
@@ -31,7 +38,7 @@ search in case the default heuristic is not satisfactory.
 - Given two graphs ``|G₁| < |G₂|``, `edit_distance(G₁, G₂)` is faster to
 compute than `edit_distance(G₂, G₁)`. Consider swapping the arguments
 if involved costs are equivalent.
-- The use of simple Minkowski costs can improve performance considerably.
+- The use of a heuristic can improve performance considerably.
 - Exploit vertex attributes when designing operation costs.
 
 ### References
@@ -49,51 +56,163 @@ julia> g1 = SimpleDiGraph([0 1 0 0 0; 0 0 1 0 0; 1 0 0 1 0; 0 0 0 0 1; 0 0 0 1 0
 julia> g2 = SimpleDiGraph([0 1 0; 0 0 1; 1 0 0]);
 
 julia> edit_distance(g1, g2)
-(3.5, Tuple[(1, 2), (2, 1), (3, 0), (4, 3), (5, 0)])
+(3.0, Tuple[(1, 3), (2, 1), (3, 2), (4, 0), (5, 0)])
 ```
 """
 function edit_distance(
     G₁::AbstractGraph,
     G₂::AbstractGraph;
-    insert_cost::Function=v -> 1.0,
-    delete_cost::Function=u -> 1.0,
-    subst_cost::Function=(u, v) -> 0.5,
-    heuristic::Function=DefaultEditHeuristic,
+    vertex_insert_cost=nothing,
+    vertex_delete_cost=nothing,
+    vertex_subst_cost=nothing,
+    edge_insert_cost=nothing,
+    edge_delete_cost=nothing,
+    edge_subst_cost=nothing,
+    heuristic=nothing,
 )
+    if isnothing(vertex_insert_cost) &&
+        isnothing(vertex_delete_cost) &&
+        isnothing(vertex_subst_cost) &&
+        isnothing(edge_insert_cost) &&
+        isnothing(edge_delete_cost) &&
+        isnothing(edge_subst_cost) &&
+        isnothing(heuristic)
+        heuristic = default_edit_heuristic
+    end
+    vertex_insert_cost = something(vertex_insert_cost, v -> 0.0)
+    vertex_delete_cost = something(vertex_delete_cost, v -> 0.0)
+    vertex_subst_cost = something(vertex_subst_cost, (u, v) -> 0.0)
+    edge_insert_cost = something(edge_insert_cost, e -> 1.0)
+    edge_delete_cost = something(edge_delete_cost, e -> 1.0)
+    edge_subst_cost = something(edge_subst_cost, (e1, e2) -> 0.0)
+    heuristic = something(heuristic, (λ, G₁, G₂) -> 0.0)
+    return _edit_distance(
+        G₁::AbstractGraph,
+        G₂::AbstractGraph,
+        vertex_insert_cost,
+        vertex_delete_cost,
+        vertex_subst_cost,
+        edge_insert_cost,
+        edge_delete_cost,
+        edge_subst_cost,
+        heuristic,
+    )
+end
+
+function _edit_distance(
+    G₁::AbstractGraph{T},
+    G₂::AbstractGraph{U},
+    vertex_insert_cost::Function,
+    vertex_delete_cost::Function,
+    vertex_subst_cost::Function,
+    edge_insert_cost::Function,
+    edge_delete_cost::Function,
+    edge_subst_cost::Function,
+    heuristic::Function,
+) where {T<:Integer,U<:Integer}
+    isdirected = is_directed(G₁) || is_directed(G₂)
+
+    # compute the cost on edges due to associate u1 to v1 and u2 to v2
+    # u2 and v2 can eventually be 0
+    function association_cost(u1, u2, v1, v2)
+        cost = 0.0
+        if has_edge(G₁, u1, u2)
+            if has_edge(G₂, v1, v2)
+                cost += edge_subst_cost(Edge(u1, u2), Edge(v1, v2))
+            else
+                cost += edge_delete_cost(Edge(u1, u2))
+            end
+        else
+            if has_edge(G₂, v1, v2)
+                cost += edge_insert_cost(Edge(v1, v2))
+            end
+        end
+        if isdirected && u1 != u2
+            if has_edge(G₁, u2, u1)
+                if has_edge(G₂, v2, v1)
+                    cost += edge_subst_cost(Edge(u2, u1), Edge(v2, v1))
+                else
+                    cost += edge_delete_cost(Edge(u2, u1))
+                end
+            else
+                if has_edge(G₂, v2, v1)
+                    cost += edge_insert_cost(Edge(v2, v1))
+                end
+            end
+        end
+        return cost
+    end
 
     # A* search heuristic
     h(λ) = heuristic(λ, G₁, G₂)
 
     # initialize open set
     OPEN = PriorityQueue{Vector{Tuple},Float64}()
-    for v in 1:nv(G₂)
-        enqueue!(OPEN, [(1, v)], subst_cost(1, v) + h([(1, v)]))
+    for v in vertices(G₂)
+        enqueue!(OPEN, [(T(1), v)], vertex_subst_cost(1, v) + h([(T(1), v)]))
     end
-    enqueue!(OPEN, [(1, 0)], delete_cost(1) + h([(1, 0)]))
+    enqueue!(OPEN, [(T(1), U(0))], vertex_delete_cost(1) + h([(T(1), U(0))]))
 
+    c = 0
     while true
         # minimum (partial) edit path
         λ, cost = peek(OPEN)
+        c += 1
         dequeue!(OPEN)
 
         if is_complete_path(λ, G₁, G₂)
             return cost, λ
         else
-            k, _ = λ[end]
-            vs = setdiff(1:nv(G₂), [v for (u, v) in λ])
+            u1, _ = λ[end]
+            u1 += T(1)
+            vs = setdiff(vertices(G₂), [v for (u, v) in λ])
 
-            if k < nv(G₁) # there are still vertices to process in G₁?
-                for v in vs
-                    λ⁺ = [λ; (k + 1, v)]
-                    enqueue!(OPEN, λ⁺, cost + subst_cost(k + 1, v) + h(λ⁺) - h(λ))
+            if u1 <= nv(G₁) # there are still vertices to process in G₁?
+                # we try every possible assignment of v1
+                for v1 in vs
+                    λ⁺ = [λ; (u1, v1)]
+                    new_cost = cost + vertex_subst_cost(u1, v1) + h(λ⁺) - h(λ)
+                    for (u2, v2) in λ
+                        new_cost += association_cost(u1, u2, v1, v2)
+                    end
+                    new_cost += association_cost(u1, u1, v1, v1) # handle self-loops
+
+                    enqueue!(OPEN, λ⁺, new_cost)
+                end
+                # we try deleting v1
+                λ⁺ = [λ; (u1, U(0))]
+                new_cost = cost + vertex_delete_cost(u1) + h(λ⁺) - h(λ)
+                for u2 in outneighbors(G₁, u1)
+                    # edges deleted later when assigning v2
+                    u2 > u1 && continue
+                    new_cost += edge_delete_cost(Edge(u1, u2))
                 end
-                λ⁺ = [λ; (k + 1, 0)]
-                enqueue!(OPEN, λ⁺, cost + delete_cost(k + 1) + h(λ⁺) - h(λ))
+                if isdirected
+                    for u2 in inneighbors(G₁, u1)
+                        # edges deleted later when assigning v2, and we should not count a self loop twice
+                        u2 >= u1 && continue
+                        new_cost += edge_delete_cost(Edge(u2, u1))
+                    end
+                end
+                enqueue!(OPEN, λ⁺, new_cost)
             else
-                # add remaining vertices of G₂ to the path
-                λ⁺ = [λ; [(0, v) for v in vs]]
-                total_insert_cost = sum(insert_cost, vs)
-                enqueue!(OPEN, λ⁺, cost + total_insert_cost + h(λ⁺) - h(λ))
+                # add remaining vertices of G₂ to the path by deleting them
+                λ⁺ = [λ; [(T(0), v) for v in vs]]
+                new_cost = cost + sum(vertex_insert_cost, vs)
+                for v1 in vs
+                    for v2 in outneighbors(G₂, v1)
+                        (v2 > v1 && v2 in vs) && continue # these edges will be deleted later
+                        new_cost += edge_insert_cost(Edge(v1, v2))
+                    end
+                    if isdirected
+                        for v2 in inneighbors(G₂, v1)
+                            (v2 > v1 && v2 in vs) && continue # these edges will be deleted later
+                            v1 == v2 && continue # we should not count a self loop twice
+                            new_cost += edge_insert_cost(Edge(v2, v1))
+                        end
+                    end
+                end
+                enqueue!(OPEN, λ⁺, new_cost + h(λ⁺) - h(λ))
             end
         end
     end
@@ -112,11 +231,40 @@ function is_complete_path(λ, G₁, G₂)
     return length(us) == nv(G₁) && length(vs) == nv(G₂)
 end
 
-function DefaultEditHeuristic(λ, G₁::AbstractGraph, G₂::AbstractGraph)
-    vs = Set([v for (u, v) in λ])
-    delete!(vs, 0)
+# edit_distance(G₁::AbstractGraph, G₂::AbstractGraph) =
+#         edit_distance(G₁, G₂,
+#             vertex_insert_cost=v -> 0.,
+#             vertex_delete_cost=u -> 0.,
+#             vertex_subst_cost=(u, v) -> 0.,
+#             edge_insert_cost=e -> 1.,
+#             edge_delete_cost=e -> 1.,
+#             edge_subst_cost=(e1, e2) -> 0.,
+#             heuristic=default_edit_heuristic)
 
-    return nv(G₂) - length(vs)
+"""
+compute an upper bound on the number of edges that can still be affected
+"""
+function default_edit_heuristic(λ, G₁::AbstractGraph, G₂::AbstractGraph)
+    us = setdiff(1:nv(G₁), [u for (u, v) in λ])
+    vs = setdiff(1:nv(G₂), [v for (u, v) in λ])
+    total_free_edges_g1 = 0
+    total_free_edges_g2 = 0
+    if !isempty(us)
+        total_free_edges_g1 = sum(u -> outdegree(G₁, u), us)
+    end
+    if !isempty(vs)
+        total_free_edges_g2 = sum(v -> outdegree(G₂, v), vs)
+    end
+    for (u1, v1) in λ
+        (u1 == 0 || v1 == 0) && continue
+        total_free_edges_g1 += count(u2 -> u2 in us, outneighbors(G₁, u1))
+        total_free_edges_g2 += count(v2 -> v2 in vs, outneighbors(G₂, v1))
+    end
+    if !is_directed(G₁) && !is_directed(G₂)
+        total_free_edges_g1 = total_free_edges_g1 / 2
+        total_free_edges_g2 = total_free_edges_g2 / 2
+    end
+    return abs(total_free_edges_g1 - total_free_edges_g2)
 end
 
 #-------------------------

diff --git a/test/edit_distance.jl b/test/edit_distance.jl
@@ -4,51 +4,48 @@
     gquad = random_regular_graph(4, 2; rng=rng)
     gpent = random_regular_graph(5, 2; rng=rng)
 
-    @testset "edit_distance $triangle, $quadrangle, $pentagon" for triangle in
-                                                                   testgraphs(gtri),
-        quadrangle in testgraphs(gquad),
-        pentagon in testgraphs(gpent)
+    g1 = star_graph(4)
+    g2 = cycle_graph(3)
 
-        d, λ = @inferred(
-            edit_distance(triangle, quadrangle, subst_cost=MinkowskiCost(1:3, 1:4))
-        )
-        @test d == 1.0
-        @test λ == Tuple[(1, 1), (2, 2), (3, 3), (0, 4)]
-
-        d, λ = @inferred(
-            edit_distance(quadrangle, triangle, subst_cost=MinkowskiCost(1:4, 1:3))
-        )
-        @test d == 1.0
-        @test λ == Tuple[(1, 1), (2, 2), (3, 3), (4, 0)]
+    vertex_insert_cost = v -> 1.0
+    vertex_delete_cost = v -> 2.0
+    vertex_subst_cost = (u, v) -> 3.0
+    edge_insert_cost = e -> 4.0
+    edge_delete_cost = e -> 5.0
+    edge_subst_cost = (e1, e2) -> 6.0
 
-        d, λ = @inferred(
-            edit_distance(triangle, pentagon, subst_cost=MinkowskiCost(1:3, 1:5))
-        )
+    @testset "undirected edit_distance" for G1 in testgraphs(g1), G2 in testgraphs(g2)
+        d, λ = @inferred(edit_distance(G1, G2))
         @test d == 2.0
-        @test λ == Tuple[(1, 1), (2, 2), (3, 3), (0, 4), (0, 5)]
-
         d, λ = @inferred(
-            edit_distance(pentagon, triangle, subst_cost=MinkowskiCost(1:5, 1:3))
+            edit_distance(
+                G1,
+                G2,
+                vertex_insert_cost=vertex_insert_cost,
+                vertex_delete_cost=vertex_delete_cost,
+                vertex_subst_cost=vertex_subst_cost,
+                edge_insert_cost=edge_insert_cost,
+                edge_delete_cost=edge_delete_cost,
+                edge_subst_cost=edge_subst_cost,
+            )
         )
-        @test d == 2.0
-        @test λ == Tuple[(1, 1), (2, 2), (3, 3), (4, 0), (5, 0)]
+        # 1 vertex deletion, 3 vertex substitution, 1 edge insertio n, 1 edge deletion, 2 edge substitution
+        @test d == 32.0
     end
 
-    @testset "Minkowski cost / bounded Minkowski" begin
-        cost = @inferred(MinkowskiCost(1:3, 1:3))
-        bcost = @inferred(BoundedMinkowskiCost(1:3, 1:3))
-        for i in 1:3
-            @test cost(i, i) == 0.0
-            @test bcost(i, i) == 2 / 3
-        end
+    g1 = DiGraph(4)
+    edges = [(1, 2), (1, 4), (2, 3), (3, 1), (3, 4), (4, 1), (1, 1), (4, 4)]
+    for e in edges
+        add_edge!(g1, e)
+    end
+    g2 = DiGraph(4)
+    edges = [(2, 1), (2, 3), (3, 1), (3, 2), (4, 1), (4, 2), (2, 2), (3, 3)]
+    for e in edges
+        add_edge!(g2, e)
     end
 
-    g1c = complete_graph(4)
-    g2c = complete_graph(4)
-    rem_edge!(g2c, 1, 2)
-    @testset "edit_distance $g1, $g2" for g1 in testgraphs(g1c), g2 in testgraphs(g2c)
-        d, λ = @inferred(edit_distance(g1, g2))
-        @test d == 2.0
-        @test λ == Tuple[(1, 1), (2, 2), (3, 3), (4, 4)]
+    @testset "directed edit_distance" for G1 in testgraphs(g1), G2 in testgraphs(g2)
+        d, λ = @inferred(edit_distance(G1, G2))
+        @test d == 4.0
     end
 end