Staging instance, all changes can be removed at any time

Skip to content
  • /*
     * Copyright (c) 2022-2023 The Software Heritage developers
     * See the AUTHORS file at the top-level directory of this distribution
     * License: GNU General Public License version 3, or any later version
     * See top-level LICENSE file for more information
     */
    
    package org.softwareheritage.graph.utils;
    
    import com.martiansoftware.jsap.*;
    import it.unimi.dsi.big.webgraph.LazyLongIterator;
    import it.unimi.dsi.big.webgraph.NodeIterator;
    import it.unimi.dsi.bits.LongArrayBitVector;
    import it.unimi.dsi.fastutil.longs.LongBigArrayBigList;
    import org.softwareheritage.graph.*;
    
    import java.io.IOException;
    import java.util.*;
    
    /* Lists all nodes nodes of the types given as argument, in topological order,
     * from leaves (contents, if selected) to the top (origins, if selected).
     *
     * This uses a DFS, so nodes are likely to be close to their neighbors.
     *
     * Some extra information is provided to allow more efficient consumption
     * of the output: number of ancestors, successors, and a sample of two ancestors.
     *
     * Sample invocation:
     *
     *   $ java -cp ~/swh-environment/swh-graph/java/target/swh-graph-*.jar -Xmx1000G -XX:PretenureSizeThreshold=512M -XX:MaxNewSize=4G -XX:+UseLargePages -XX:+UseTransparentHugePages -XX:+UseNUMA -XX:+UseTLAB -XX:+ResizeTLAB org.softwareheritage.graph.utils.TopoSort /dev/shm/swh-graph/default/graph dfs backward 'rev,rel,snp,ori' \
     *      | pv --line-mode --wait \
     *      | zstdmt \
     *      > /poolswh/softwareheritage/vlorentz/2022-04-25_toposort_rev,rel,snp,ori.txt.zst
     */
    
    public class TopoSort {
        private SwhBidirectionalGraph underlyingGraph;
        private Subgraph graph;
        private Subgraph transposedGraph;
        private boolean dfs; // Whether to run in BFS or DFS
        private LongBigArrayBigList ready;
        private long endIndex; // In BFS mode, index of the end of the queue where to pop from; in DFS mode, index of the
                               // top of the stack
    
        private class MyLongBigArrayBigList extends LongBigArrayBigList {
            public MyLongBigArrayBigList(long size) {
                super(size);
                this.size = size;
            }
        }
    
    
        public static void main(String[] args) throws IOException, ClassNotFoundException {
            if (args.length != 4) {
                System.err.println(
                        "Syntax: java org.softwareheritage.graph.utils.TopoSort <path/to/graph> {dfs|bfs} {forward|backward} <nodeTypes>");
                System.exit(1);
            }
            String graphPath = args[0];
            String algorithm = args[1];
            String directionString = args[2];
            String nodeTypes = args[3];
    
            TopoSort toposort = new TopoSort();
    
            toposort.loadGraph(graphPath, nodeTypes);
    
            if (directionString.equals("forward")) {
                toposort.swapGraphs();
            } else if (!directionString.equals("backward")) {
                System.err.println("Invalid direction " + directionString);
                System.exit(1);
            }
            if (algorithm.equals("dfs")) {
                toposort.dfs = true;
            } else if (algorithm.equals("bfs")) {
                toposort.dfs = false;
            } else {
                System.err.println("Invalid algorithm " + algorithm);
                System.exit(1);
            }
    
            toposort.endIndex = 0;
            toposort.toposort();
        }
    
        public void swapGraphs() {
            Subgraph tmp;
            tmp = graph;
            graph = transposedGraph;
            transposedGraph = tmp;
        }
    
        public void loadGraph(String graphBasename, String nodeTypes) throws IOException {
            System.err.println("Loading graph " + graphBasename + " ...");
            underlyingGraph = SwhBidirectionalGraph.loadMapped(graphBasename);
            System.err.println("Selecting subgraphs.");
            graph = new Subgraph(underlyingGraph, new AllowedNodes(nodeTypes));
            transposedGraph = graph.transpose();
            System.err.println("Graph loaded.");
    
            ready = new LongBigArrayBigList(graph.numNodes());
        }
    
        private void pushReady(long nodeId) {
            if (dfs) {
                endIndex++;
            }
            ready.add(nodeId);
        }
    
        private long popReady() {
            if (dfs) {
                return ready.removeLong(--endIndex);
            } else {
                return ready.getLong(endIndex++);
            }
        }
    
        private long readySize() {
            if (dfs) {
                return endIndex;
            } else {
                return ready.size64() - endIndex;
            }
        }
    
        private boolean readyNodes() {
            return readySize() > 0;
        }
    
        /* Prints nodes in topological order, based on a DFS or BFS. */
        public void toposort() {
            MyLongBigArrayBigList unvisitedAncestors = new MyLongBigArrayBigList(underlyingGraph.numNodes());
    
            /* First, push all leaves to the stack */
            System.err.println("Listing leaves.");
            long total_nodes = 0;
            NodeIterator nodeIterator = graph.nodeIterator();
            while (nodeIterator.hasNext()) {
                long currentNodeId = nodeIterator.nextLong();
                total_nodes++;
                long nbAncestors = graph.outdegree(currentNodeId);
                unvisitedAncestors.set(currentNodeId, nbAncestors);
                if (nbAncestors != 0) {
                    /* The node has ancestor, so it is not a leaf. */
                    continue;
                }
                pushReady(currentNodeId);
                if (readySize() % 10000000 == 0) {
                    float ready_size_f = readySize();
                    float total_nodes_f = total_nodes;
                    System.err.printf("Listed %.02f B leaves (out of %.02f B nodes)\n", ready_size_f / 1000000000.,
                            total_nodes_f / 1000000000.);
                }
            }
            System.err.println("Leaves listed, starting traversal.");
    
            System.out.format("SWHID,ancestors,successors,sample_ancestor1,sample_ancestor2\n");
            long printed_nodes = 0;
            while (readyNodes()) {
                long currentNodeId = popReady();
    
                /* Find its successors which are ready */
                LazyLongIterator successors = transposedGraph.successors(currentNodeId);
                long successorCount = 0;
                for (long successorNodeId; (successorNodeId = successors.nextLong()) != -1;) {
                    successorCount++;
                    LazyLongIterator successorAncestors = graph.successors(successorNodeId);
                    long nbUnvisitedAncestors = unvisitedAncestors.getLong(successorNodeId);
                    nbUnvisitedAncestors--;
                    if (nbUnvisitedAncestors == 0) {
                        pushReady(successorNodeId);
                    } else if (nbUnvisitedAncestors < 0) {
                        System.err.format("[BUG] %s has negative number of unvisited ancestors: %d\n", graph.getSWHID(successorNodeId), nbUnvisitedAncestors);
                    }
                    unvisitedAncestors.set(successorNodeId, nbUnvisitedAncestors);
                }
    
                String[] sampleAncestors = {"", ""};
                long ancestorCount = 0;
                LazyLongIterator ancestors = graph.successors(currentNodeId);
                for (long ancestorNodeId; (ancestorNodeId = ancestors.nextLong()) != -1;) {
                    if (ancestorCount < sampleAncestors.length) {
                        sampleAncestors[(int) ancestorCount] = graph.getSWHID(ancestorNodeId).toString();
                    }
                    ancestorCount++;
                }
    
                /*
                 * Print the node
                 *
                 * TODO: print its depth too?
                 */
                SWHID currentNodeSWHID = graph.getSWHID(currentNodeId);
                System.out.format("%s,%d,%d,%s,%s\n", currentNodeSWHID, ancestorCount, successorCount, sampleAncestors[0],
                        sampleAncestors[1]);
                printed_nodes += 1;
                if (printed_nodes % 10000000 == 0) {
                    float printed_nodes_f = printed_nodes;
                    float total_nodes_f = total_nodes;
                    System.err.printf("Sorted %.02f B nodes (out of %.02f B)\n", printed_nodes_f / 1000000000.,
                            total_nodes_f / 1000000000.);
                }
            }
        }
    }
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment