diff --git a/src/connectivity.jl b/src/connectivity.jl index 788215dd8..45526d25d 100644 --- a/src/connectivity.jl +++ b/src/connectivity.jl @@ -217,102 +217,159 @@ julia> strongly_connected_components(g) [8, 9] [5, 6, 7] [1, 2, 3, 4] - [10, 11] + [10, 11] + + +This currently uses a modern variation on Tarjan's algorithm, largely derived from algorithm 3 in David J. Pearce's +preprint: https://homepages.ecs.vuw.ac.nz/~djp/files/IPL15-preprint.pdf , with some changes & tradeoffs when unrolling it to an +imperative algorithm. ``` """ function strongly_connected_components end # see https://github.com/mauro3/SimpleTraits.jl/issues/47#issuecomment-327880153 for syntax + @traitfn function strongly_connected_components( g::AG::IsDirected ) where {T<:Integer,AG<:AbstractGraph{T}} - zero_t = zero(T) - one_t = one(T) - nvg = nv(g) - count = one_t + if iszero(nv(g)) + return Vector{Vector{T}}() + end + return _strongly_connected_components_tarjan(g, infer_nb_iterstate_type(g)) +end + +# In recursive form, Tarjans algorithm has a recursive call inside a for loop. +# To save the loop state of each recursive step in a stack efficiently, +# we need to infer the type of its state (which should almost always be an int). +destructure_type(x) = Any +destructure_type(::Type{Union{Nothing,Tuple{<:Any,B}}}) where {B} = B + +infer_nb_iterstate_type(::AbstractSimpleGraph{T}) where {T} = T + +function infer_nb_iterstate_type(g::AbstractGraph{T}) where {T} + # If no specific dispatch is given, we peek at the first vertex and use Base.Iterator magic to try infering the type. + return destructure_type( + Base.Iterators.approx_iter_type(typeof(outneighbors(g, one(T)))) + ) +end - index = zeros(T, nvg) # first time in which vertex is discovered - stack = Vector{T}() # stores vertices which have been discovered and not yet assigned to any component - onstack = zeros(Bool, nvg) # false if a vertex is waiting in the stack to receive a component assignment - lowlink = zeros(T, nvg) # lowest index vertex that it can reach through back edge (index array not vertex id number) - parents = zeros(T, nvg) # parent of every vertex in dfs +# Vertex size threshold below which it isn't worth keeping the DFS iteration state. +is_large_vertex(g, v) = length(outneighbors(g, v)) >= 1024 +is_unvisited(data::AbstractVector, v::Integer) = iszero(data[v]) + +# The key idea behind any variation on Tarjan's algorithm is to use DFS and pop off found components. +# Whenever we are forced to backtrack, we are in a bottom cycle of the remaining graph, +# which we accumulate in a stack while backtracking, until we reach a local root. +# A local root is a vertex from which we cannot reach any node that was visited earlier by DFS. +# As such, when we have backtracked to it, we may pop off the contents the stack as a strongly connected component. +function _strongly_connected_components_tarjan_with_index( + g::AG, nb_iter_statetype::Type{S} +) where {T<:Integer,AG<:AbstractGraph{T},S} + nvg = nv(g) + one_count = one(T) + count = nvg # (Counting downwards) Visitation order for the branch being explored. Backtracks when we pop an scc. + component_count = one_count # Index of the current component being discovered. + # Invariant 1: component_count is always smaller than count. + # Invariant 2: if rindex[v] < component_count, then v is in components[rindex[v]]. + # This trivially lets us tell if a vertex belongs to a previously discovered scc without any extra bits, + # just inequalities that combine naturally with other checks. + + is_component_root = Vector{Bool}(undef, nvg) # Fields are set when tracing and read when backtracking, so can be initialized undef. + rindex = zeros(T, nvg) components = Vector{Vector{T}}() # maintains a list of scc (order is not guaranteed in API) + stack = Vector{T}() # while backtracking, stores vertices which have been discovered and not yet assigned to any component dfs_stack = Vector{T}() + largev_iterstate_stack = Vector{Tuple{T,S}}() # For large vertexes we push the iteration state into a stack so we may resume it. + # adding this last stack fixes the O(|E|^2) performance bug that could previously be seen in large star graphs. + # The Tuples come from Julia's iteration protocol, and the code is structured so that we never push a Nothing into this last stack. @inbounds for s in vertices(g) - if index[s] == zero_t - index[s] = count - lowlink[s] = count - onstack[s] = true - parents[s] = s - push!(stack, s) - count = count + one_t + if is_unvisited(rindex, s) + rindex[s] = count + is_component_root[s] = true + count -= one_count # start dfs from 's' push!(dfs_stack, s) + if is_large_vertex(g, s) + push!(largev_iterstate_stack, iterate(outneighbors(g, s))) + end - while !isempty(dfs_stack) - v = dfs_stack[end] # end is the most recently added item - u = zero_t - @inbounds for v_neighbor in outneighbors(g, v) - if index[v_neighbor] == zero_t - # unvisited neighbor found - u = v_neighbor - break - # GOTO A push u onto DFS stack and continue DFS - elseif onstack[v_neighbor] - # we have already seen n, but can update the lowlink of v - # which has the effect of possibly keeping v on the stack until n is ready to pop. - # update lowest index 'v' can reach through out neighbors - lowlink[v] = min(lowlink[v], index[v_neighbor]) + @inbounds while !isempty(dfs_stack) + v = dfs_stack[end] #end is the most recently added item + outn = outneighbors(g, v) + v_is_large = is_large_vertex(g, v) + next = v_is_large ? pop!(largev_iterstate_stack) : iterate(outn) + while next !== nothing + (v_neighbor, state) = next + if is_unvisited(rindex, v_neighbor) + break #GOTO A: push v_neighbor onto DFS stack and continue DFS. + # Note: This is no longer quadratic for (very large) tournament graphs or star graphs, + # as we save the iteration state in largev_iterstate_stack for large vertices. + # The loop is tight so not saving the state still benchmarks well unless the vertex orders are large enough to make quadratic growth kick in. + elseif (rindex[v_neighbor] > rindex[v]) + rindex[v] = rindex[v_neighbor] + is_component_root[v] = false end + next = iterate(outn, state) end - if u == zero_t + if isnothing(next) # While loop above ended naturally. # All out neighbors already visited or no out neighbors # we have fully explored the DFS tree from v. # time to start popping. popped = pop!(dfs_stack) - lowlink[parents[popped]] = min( - lowlink[parents[popped]], lowlink[popped] - ) - - if index[v] == lowlink[v] - # found a cycle in a completed dfs tree. - component = Vector{T}() - - while !isempty(stack) # break when popped == v - # drain stack until we see v. - # everything on the stack until we see v is in the SCC rooted at v. - popped = pop!(stack) - push!(component, popped) - onstack[popped] = false - # popped has been assigned a component, so we will never see it again. - if popped == v - # we have drained the stack of an entire component. - break - end + if is_component_root[popped] # Found an SCC rooted at popped which is a bottom cycle in remaining graph. + component = T[popped] + count += one_count # We also backtrack the count to reset it to what it would be if the component were never in the graph. + while !isempty(stack) && (rindex[popped] >= rindex[stack[end]]) # Keep popping its children from the backtracking stack. + newpopped = pop!(stack) + rindex[newpopped] = component_count # Bigger than the value of anything unexplored. + push!(component, newpopped) # popped has been assigned a component, so we will never see it again. + count += one_count end - - reverse!(component) + rindex[popped] = component_count + component_count += one_count push!(components, component) + else # Invariant: the DFS stack can never be empty in this second branch where popped is not a root. + if (rindex[popped] > rindex[dfs_stack[end]]) + rindex[dfs_stack[end]] = rindex[popped] + is_component_root[dfs_stack[end]] = false + end + # Because we only push to stack when backtracking, it gets filled up less than in Tarjan's original algorithm. + push!(stack, popped) # For DAG inputs, the stack variable never gets touched at all. end else # LABEL A # add unvisited neighbor to dfs - index[u] = count - lowlink[u] = count - onstack[u] = true - parents[u] = v - count = count + one_t - - push!(stack, u) + (u, state) = next push!(dfs_stack, u) + if v_is_large + push!(largev_iterstate_stack, next) # Because this is the else branch of isnothing(state), we can push this on the stack. + end + if is_large_vertex(g, u) + push!(largev_iterstate_stack, iterate(outneighbors(g, u))) # Because u is large, iterate cannot return nothing, so we can push this on stack. + end + is_component_root[u] = true + rindex[u] = count + count -= one_count # next iteration of while loop will expand the DFS tree from u. end end end end + #Unlike in the original Tarjans, rindex are potentially also worth returning here. + # For any v, v is in components[rindex[v]], s it acts as a lookup table for components. + # Scipy's graph library returns only that and lets the user sort by its values. + return components # ,rindex +end + +function _strongly_connected_components_tarjan( + g::AG, nb_iter_statetype::Type{S} +) where {T<:Integer,AG<:AbstractGraph{T},S} + components, rindex = _strongly_connected_components_tarjan_with_index( + g, nb_iter_statetype + ) return components end