Skip to content

Commit

Permalink
Rewrite of edit_distance with edge costs. fix #111 (#137)
Browse files Browse the repository at this point in the history
* fix edit_distance

* some fixes

* add tests; little bit of cleaning

* make code type stable

* use something; initiate cost with a float

* Apply formatter

* Fix docstring

---------

Co-authored-by: Guillaume Dalle <22795598+gdalle@users.noreply.github.com>
  • Loading branch information
etiennedeg and gdalle committed Jun 28, 2023
1 parent af28a4f commit 327740b
Show file tree
Hide file tree
Showing 2 changed files with 210 additions and 65 deletions.
204 changes: 176 additions & 28 deletions src/editdist.jl
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,16 @@ representing vertex operations:
### Optional Arguments
- `insert_cost::Function=v->1.0`
- `delete_cost::Function=u->1.0`
- `subst_cost::Function=(u,v)->0.5`
- `vertex_insert_cost::Function=v->0.`
- `vertex_delete_cost::Function=u->0.`
- `vertex_subst_cost::Function=(u, v)->0.`
- `edge_insert_cost::Function=e->1.`
- `edge_delete_cost::Function=e->1.`
- `edge_subst_cost::Function=(e1, e2)->0.`
The algorithm will always try to match two edges if it can, so if it is
preferable to delete two edges rather than match them, this should be
reflected in the `edge_subst_cost` function.
By default, the algorithm uses constant operation costs. The
user can provide classical Minkowski costs computed from vertex
Expand All @@ -31,7 +38,7 @@ search in case the default heuristic is not satisfactory.
- Given two graphs ``|G₁| < |G₂|``, `edit_distance(G₁, G₂)` is faster to
compute than `edit_distance(G₂, G₁)`. Consider swapping the arguments
if involved costs are equivalent.
- The use of simple Minkowski costs can improve performance considerably.
- The use of a heuristic can improve performance considerably.
- Exploit vertex attributes when designing operation costs.
### References
Expand All @@ -49,51 +56,163 @@ julia> g1 = SimpleDiGraph([0 1 0 0 0; 0 0 1 0 0; 1 0 0 1 0; 0 0 0 0 1; 0 0 0 1 0
julia> g2 = SimpleDiGraph([0 1 0; 0 0 1; 1 0 0]);
julia> edit_distance(g1, g2)
(3.5, Tuple[(1, 2), (2, 1), (3, 0), (4, 3), (5, 0)])
(3.0, Tuple[(1, 3), (2, 1), (3, 2), (4, 0), (5, 0)])
```
"""
function edit_distance(
G₁::AbstractGraph,
G₂::AbstractGraph;
insert_cost::Function=v -> 1.0,
delete_cost::Function=u -> 1.0,
subst_cost::Function=(u, v) -> 0.5,
heuristic::Function=DefaultEditHeuristic,
vertex_insert_cost=nothing,
vertex_delete_cost=nothing,
vertex_subst_cost=nothing,
edge_insert_cost=nothing,
edge_delete_cost=nothing,
edge_subst_cost=nothing,
heuristic=nothing,
)
if isnothing(vertex_insert_cost) &&
isnothing(vertex_delete_cost) &&
isnothing(vertex_subst_cost) &&
isnothing(edge_insert_cost) &&
isnothing(edge_delete_cost) &&
isnothing(edge_subst_cost) &&
isnothing(heuristic)
heuristic = default_edit_heuristic
end
vertex_insert_cost = something(vertex_insert_cost, v -> 0.0)
vertex_delete_cost = something(vertex_delete_cost, v -> 0.0)
vertex_subst_cost = something(vertex_subst_cost, (u, v) -> 0.0)
edge_insert_cost = something(edge_insert_cost, e -> 1.0)
edge_delete_cost = something(edge_delete_cost, e -> 1.0)
edge_subst_cost = something(edge_subst_cost, (e1, e2) -> 0.0)
heuristic = something(heuristic, (λ, G₁, G₂) -> 0.0)
return _edit_distance(
G₁::AbstractGraph,
G₂::AbstractGraph,
vertex_insert_cost,
vertex_delete_cost,
vertex_subst_cost,
edge_insert_cost,
edge_delete_cost,
edge_subst_cost,
heuristic,
)
end

function _edit_distance(
G₁::AbstractGraph{T},
G₂::AbstractGraph{U},
vertex_insert_cost::Function,
vertex_delete_cost::Function,
vertex_subst_cost::Function,
edge_insert_cost::Function,
edge_delete_cost::Function,
edge_subst_cost::Function,
heuristic::Function,
) where {T<:Integer,U<:Integer}
isdirected = is_directed(G₁) || is_directed(G₂)

# compute the edge cost incurred by associating u1 with v1 and u2 with v2
# u2 and v2 may be 0
function association_cost(u1, u2, v1, v2)
cost = 0.0
if has_edge(G₁, u1, u2)
if has_edge(G₂, v1, v2)
cost += edge_subst_cost(Edge(u1, u2), Edge(v1, v2))
else
cost += edge_delete_cost(Edge(u1, u2))
end
else
if has_edge(G₂, v1, v2)
cost += edge_insert_cost(Edge(v1, v2))
end
end
if isdirected && u1 != u2
if has_edge(G₁, u2, u1)
if has_edge(G₂, v2, v1)
cost += edge_subst_cost(Edge(u2, u1), Edge(v2, v1))
else
cost += edge_delete_cost(Edge(u2, u1))
end
else
if has_edge(G₂, v2, v1)
cost += edge_insert_cost(Edge(v2, v1))
end
end
end
return cost
end

# A* search heuristic
h(λ) = heuristic(λ, G₁, G₂)

# initialize open set
OPEN = PriorityQueue{Vector{Tuple},Float64}()
for v in 1:nv(G₂)
enqueue!(OPEN, [(1, v)], subst_cost(1, v) + h([(1, v)]))
for v in vertices(G₂)
enqueue!(OPEN, [(T(1), v)], vertex_subst_cost(1, v) + h([(T(1), v)]))
end
enqueue!(OPEN, [(1, 0)], delete_cost(1) + h([(1, 0)]))
enqueue!(OPEN, [(T(1), U(0))], vertex_delete_cost(1) + h([(T(1), U(0))]))

c = 0
while true
# minimum (partial) edit path
λ, cost = peek(OPEN)
c += 1
dequeue!(OPEN)

if is_complete_path(λ, G₁, G₂)
return cost, λ
else
k, _ = λ[end]
vs = setdiff(1:nv(G₂), [v for (u, v) in λ])
u1, _ = λ[end]
u1 += T(1)
vs = setdiff(vertices(G₂), [v for (u, v) in λ])

if k < nv(G₁) # there are still vertices to process in G₁?
for v in vs
λ⁺ = [λ; (k + 1, v)]
enqueue!(OPEN, λ⁺, cost + subst_cost(k + 1, v) + h(λ⁺) - h(λ))
if u1 <= nv(G₁) # there are still vertices to process in G₁?
# we try every possible assignment of v1
for v1 in vs
λ⁺ = [λ; (u1, v1)]
new_cost = cost + vertex_subst_cost(u1, v1) + h(λ⁺) - h(λ)
for (u2, v2) in λ
new_cost += association_cost(u1, u2, v1, v2)
end
new_cost += association_cost(u1, u1, v1, v1) # handle self-loops

enqueue!(OPEN, λ⁺, new_cost)
end
# we try deleting u1
λ⁺ = [λ; (u1, U(0))]
new_cost = cost + vertex_delete_cost(u1) + h(λ⁺) - h(λ)
for u2 in outneighbors(G₁, u1)
# edges deleted later when assigning v2
u2 > u1 && continue
new_cost += edge_delete_cost(Edge(u1, u2))
end
λ⁺ = [λ; (k + 1, 0)]
enqueue!(OPEN, λ⁺, cost + delete_cost(k + 1) + h(λ⁺) - h(λ))
if isdirected
for u2 in inneighbors(G₁, u1)
# edges deleted later when assigning v2, and we should not count a self loop twice
u2 >= u1 && continue
new_cost += edge_delete_cost(Edge(u2, u1))
end
end
enqueue!(OPEN, λ⁺, new_cost)
else
# add remaining vertices of G₂ to the path
λ⁺ = [λ; [(0, v) for v in vs]]
total_insert_cost = sum(insert_cost, vs)
enqueue!(OPEN, λ⁺, cost + total_insert_cost + h(λ⁺) - h(λ))
# add remaining vertices of G₂ to the path by inserting them
λ⁺ = [λ; [(T(0), v) for v in vs]]
new_cost = cost + sum(vertex_insert_cost, vs)
for v1 in vs
for v2 in outneighbors(G₂, v1)
(v2 > v1 && v2 in vs) && continue # these edges will be deleted later
new_cost += edge_insert_cost(Edge(v1, v2))
end
if isdirected
for v2 in inneighbors(G₂, v1)
(v2 > v1 && v2 in vs) && continue # these edges will be deleted later
v1 == v2 && continue # we should not count a self loop twice
new_cost += edge_insert_cost(Edge(v2, v1))
end
end
end
enqueue!(OPEN, λ⁺, new_cost + h(λ⁺) - h(λ))
end
end
end
Expand All @@ -112,11 +231,40 @@ function is_complete_path(λ, G₁, G₂)
return length(us) == nv(G₁) && length(vs) == nv(G₂)
end

function DefaultEditHeuristic(λ, G₁::AbstractGraph, G₂::AbstractGraph)
vs = Set([v for (u, v) in λ])
delete!(vs, 0)
# edit_distance(G₁::AbstractGraph, G₂::AbstractGraph) =
# edit_distance(G₁, G₂,
# vertex_insert_cost=v -> 0.,
# vertex_delete_cost=u -> 0.,
# vertex_subst_cost=(u, v) -> 0.,
# edge_insert_cost=e -> 1.,
# edge_delete_cost=e -> 1.,
# edge_subst_cost=(e1, e2) -> 0.,
# heuristic=default_edit_heuristic)

return nv(G₂) - length(vs)
"""
    default_edit_heuristic(λ, G₁, G₂)

Default heuristic used by `edit_distance`, evaluated on a partial edit path `λ`
(a collection of `(u, v)` pairs in which `0` stands for "no vertex").

For each graph, count its "free" edges: the out-degree of every vertex that does
not appear in `λ` yet, plus, for every pair of `λ` whose two entries are both
nonzero, the out-neighbors of the paired vertex that do not appear in `λ` yet.
When both graphs are undirected each count is halved. Return the absolute
difference between the two counts.
"""
function default_edit_heuristic(λ, G₁::AbstractGraph, G₂::AbstractGraph)
    # vertices of each graph that the partial path λ has not touched yet
    us = setdiff(1:nv(G₁), [u for (u, v) in λ])
    vs = setdiff(1:nv(G₂), [v for (u, v) in λ])
    # hashed copies so that the membership tests in the loop below do not
    # rescan the whole vector for every neighbor
    free_us = Set(us)
    free_vs = Set(vs)

    total_free_edges_g1 = 0
    total_free_edges_g2 = 0
    # the emptiness guards avoid reducing over an empty collection
    if !isempty(us)
        total_free_edges_g1 = sum(u -> outdegree(G₁, u), us)
    end
    if !isempty(vs)
        total_free_edges_g2 = sum(v -> outdegree(G₂, v), vs)
    end

    for (u1, v1) in λ
        # pairs recording a deletion or an insertion are skipped entirely
        (u1 == 0 || v1 == 0) && continue
        total_free_edges_g1 += count(u2 -> u2 in free_us, outneighbors(G₁, u1))
        total_free_edges_g2 += count(v2 -> v2 in free_vs, outneighbors(G₂, v1))
    end

    # NOTE(review): presumably halved because an undirected edge is reached
    # from both of its endpoints by the counting above — confirm for self-loops
    if !is_directed(G₁) && !is_directed(G₂)
        total_free_edges_g1 = total_free_edges_g1 / 2
        total_free_edges_g2 = total_free_edges_g2 / 2
    end
    return abs(total_free_edges_g1 - total_free_edges_g2)
end

#-------------------------
Expand Down
71 changes: 34 additions & 37 deletions test/edit_distance.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,51 +4,48 @@
gquad = random_regular_graph(4, 2; rng=rng)
gpent = random_regular_graph(5, 2; rng=rng)

@testset "edit_distance $triangle, $quadrangle, $pentagon" for triangle in
testgraphs(gtri),
quadrangle in testgraphs(gquad),
pentagon in testgraphs(gpent)
g1 = star_graph(4)
g2 = cycle_graph(3)

d, λ = @inferred(
edit_distance(triangle, quadrangle, subst_cost=MinkowskiCost(1:3, 1:4))
)
@test d == 1.0
@test λ == Tuple[(1, 1), (2, 2), (3, 3), (0, 4)]

d, λ = @inferred(
edit_distance(quadrangle, triangle, subst_cost=MinkowskiCost(1:4, 1:3))
)
@test d == 1.0
@test λ == Tuple[(1, 1), (2, 2), (3, 3), (4, 0)]
vertex_insert_cost = v -> 1.0
vertex_delete_cost = v -> 2.0
vertex_subst_cost = (u, v) -> 3.0
edge_insert_cost = e -> 4.0
edge_delete_cost = e -> 5.0
edge_subst_cost = (e1, e2) -> 6.0

d, λ = @inferred(
edit_distance(triangle, pentagon, subst_cost=MinkowskiCost(1:3, 1:5))
)
@testset "undirected edit_distance" for G1 in testgraphs(g1), G2 in testgraphs(g2)
d, λ = @inferred(edit_distance(G1, G2))
@test d == 2.0
@test λ == Tuple[(1, 1), (2, 2), (3, 3), (0, 4), (0, 5)]

d, λ = @inferred(
edit_distance(pentagon, triangle, subst_cost=MinkowskiCost(1:5, 1:3))
edit_distance(
G1,
G2,
vertex_insert_cost=vertex_insert_cost,
vertex_delete_cost=vertex_delete_cost,
vertex_subst_cost=vertex_subst_cost,
edge_insert_cost=edge_insert_cost,
edge_delete_cost=edge_delete_cost,
edge_subst_cost=edge_subst_cost,
)
)
@test d == 2.0
@test λ == Tuple[(1, 1), (2, 2), (3, 3), (4, 0), (5, 0)]
# 1 vertex deletion, 3 vertex substitutions, 1 edge insertion, 1 edge deletion, 2 edge substitutions
@test d == 32.0
end

@testset "Minkowski cost / bounded Minkowski" begin
cost = @inferred(MinkowskiCost(1:3, 1:3))
bcost = @inferred(BoundedMinkowskiCost(1:3, 1:3))
for i in 1:3
@test cost(i, i) == 0.0
@test bcost(i, i) == 2 / 3
end
g1 = DiGraph(4)
edges = [(1, 2), (1, 4), (2, 3), (3, 1), (3, 4), (4, 1), (1, 1), (4, 4)]
for e in edges
add_edge!(g1, e)
end
g2 = DiGraph(4)
edges = [(2, 1), (2, 3), (3, 1), (3, 2), (4, 1), (4, 2), (2, 2), (3, 3)]
for e in edges
add_edge!(g2, e)
end

g1c = complete_graph(4)
g2c = complete_graph(4)
rem_edge!(g2c, 1, 2)
@testset "edit_distance $g1, $g2" for g1 in testgraphs(g1c), g2 in testgraphs(g2c)
d, λ = @inferred(edit_distance(g1, g2))
@test d == 2.0
@test λ == Tuple[(1, 1), (2, 2), (3, 3), (4, 4)]
@testset "directed edit_distance" for G1 in testgraphs(g1), G2 in testgraphs(g2)
d, λ = @inferred(edit_distance(G1, G2))
@test d == 4.0
end
end

0 comments on commit 327740b

Please sign in to comment.