Commit
changed processTransposeBatch to prune duplicate arcs
lfoscari committed Feb 20, 2023
1 parent e0ae0c7 commit 10c1152
Showing 1 changed file with 46 additions and 18 deletions.
src/it/unimi/dsi/webgraph/Transform.java — 64 changes: 46 additions & 18 deletions
@@ -1199,7 +1199,7 @@ public int[] successorArray() {
 				final int numPairs = this.numPairs;
 				// Neither quicksort nor heaps are stable, so we reestablish order here.
 				IntArrays.quickSort(successor, 0, numPairs);
-				if (numPairs!= 0) {
+				if (numPairs != 0) {
 					int p = 0;
 					for (int j = 1; j < numPairs; j++) if (successor[p] != successor[j]) successor[++p] = successor[j];
 					outdegree = p + 1;
@@ -1311,6 +1311,8 @@ class InternalArcLabelledNodeIterator extends ArcLabelledNodeIterator {
 		private int last;
 		/** The outdegree of the current node (valid if {@link #last} is not -1). */
 		private int outdegree;
+		/** The number of pairs associated with the current node (valid if {@link #last} is not -1). */
+		private int numPairs;
 		/** The successors of the current node (valid if {@link #last} is not -1);
 		 * only the first {@link #outdegree} entries are meaningful. */
 		private int[] successor;
@@ -1319,7 +1321,7 @@ class InternalArcLabelledNodeIterator extends ArcLabelledNodeIterator {
 		private Label[] label;

 		public InternalArcLabelledNodeIterator(final int upperBound) throws IOException {
-			this(upperBound, null, null, null, null, null, -1, 0, IntArrays.EMPTY_ARRAY, Label.EMPTY_LABEL_ARRAY);
+			this(upperBound, null, null, null, null, null, -1, -1, IntArrays.EMPTY_ARRAY, Label.EMPTY_LABEL_ARRAY);
 		}

 		public InternalArcLabelledNodeIterator(final int upperBound, final InputBitStream[] baseIbs, final InputBitStream[] baseLabelInputBitStream, final int[] refArray, final int[] prevTarget, final int[] inputStreamLength, final int last, final int outdegree, final int successor[], final Label[] label) throws IOException {
@@ -1377,8 +1379,10 @@ public boolean hasNext() {

 		@Override
 		public int nextInt() {
+			if (! hasNext()) throw new NoSuchElementException();
 			last++;
 			int d = 0;
+			outdegree = -1;
 			int i;

 			try {
@@ -1395,8 +1399,8 @@
 					if (--inputStreamLength[i] == 0) {
 						queue.dequeue();
 						batchIbs[i].close();
-						labelInputBitStream[i].close();
 						batchIbs[i] = null;
+						labelInputBitStream[i].close();
 						labelInputBitStream[i] = null;
 					}
 					else {
@@ -1410,8 +1414,19 @@
 					}
 					d++;
 				}
+
+				numPairs = d;
+			}
+			catch(final IOException e) {
+				e.printStackTrace();
+				throw new RuntimeException(this + " " + e);
+			}
+
+			// Compute outdegree
+			if (outdegree == -1) {
+				final int numPairs = this.numPairs;
 				// Neither quicksort nor heaps are stable, so we reestablish order here.
-				it.unimi.dsi.fastutil.Arrays.quickSort(0, d, (x, y) -> Integer.compare(successor[x], successor[y]),
+				it.unimi.dsi.fastutil.Arrays.quickSort(0, numPairs, (x, y) -> Integer.compare(successor[x], successor[y]),
 					(x, y) -> {
 						final int t = successor[x];
 						successor[x] = successor[y];
@@ -1420,12 +1435,16 @@ public int nextInt() {
 						label[x] = label[y];
 						label[y] = l;
 					});
-			}
-			catch(final IOException e) {
-				throw new RuntimeException(e);
-			}
+
+				if (numPairs != 0) {
+					// Avoid returning the duplicate arcs
+					int p = 0;
+					for (int j = 1; j < numPairs; j++) if (successor[p] != successor[j]) successor[++p] = successor[j];
+					outdegree = p + 1;
+				}
+				else outdegree = 0;
+			}

-			outdegree = d;
 			return last;
 		}

@@ -1604,6 +1623,12 @@ public static int processTransposeBatch(final int n, final int[] source, final i
 		batchFile.deleteOnExit();
 		batches.add(batchFile);
 		final OutputBitStream batch = new OutputBitStream(batchFile);
+
+		final File labelFile = File.createTempFile("label-", ".bits", tempDir);
+		labelFile.deleteOnExit();
+		labelBatches.add(labelFile);
+		final OutputBitStream labelObs = new OutputBitStream(labelFile);
+
 		int u = 0;

 		if (n != 0) {
@@ -1616,32 +1641,35 @@ public static int processTransposeBatch(final int n, final int[] source, final i
 			batch.writeDelta(prevSource);
 			batch.writeDelta(target[0]);

+			labelBitStream.position(start[0]);
+			prototype.fromBitStream(labelBitStream, source[0]);
+			prototype.toBitStream(labelObs, target[0]);
+
 			for(int i = 1; i < n; i++) {
 				if (source[i] != prevSource) {
 					batch.writeDelta(source[i] - prevSource);
 					batch.writeDelta(target[i]);
 					prevSource = source[i];
+
+					labelBitStream.position(start[i]);
+					prototype.fromBitStream(labelBitStream, source[i]);
+					prototype.toBitStream(labelObs, target[i]);
 				}
 				else if (target[i] != target[i - 1]) {
 					// We don't write duplicate pairs
 					batch.writeDelta(0);
 					batch.writeDelta(target[i] - target[i - 1] - 1);
+
+					labelBitStream.position(start[i]);
+					prototype.fromBitStream(labelBitStream, source[i]);
+					prototype.toBitStream(labelObs, target[i]);
 				}
 			}
 		}
 		else batch.writeDelta(0);

 		batch.close();

-		final File labelFile = File.createTempFile("label-", ".bits", tempDir);
-		labelFile.deleteOnExit();
-		labelBatches.add(labelFile);
-		final OutputBitStream labelObs = new OutputBitStream(labelFile);
-		for (int i = 0; i < n; i++) {
-			labelBitStream.position(start[i]);
-			prototype.fromBitStream(labelBitStream, source[i]);
-			prototype.toBitStream(labelObs, target[i]);
-		}
 		labelObs.close();

 		return u;

5 comments on commit 10c1152

@vigna (Owner) commented on 10c1152 Feb 20, 2023
But pruning duplicate arcs cannot be done without a LabelMergeStrategy. How are you going to decide which label to keep otherwise? Also, why are you getting duplicates?

@lfoscari (Author) commented on 10c1152 Feb 20, 2023
The problem of duplicate arcs arose when using a very small batch size, which caused the same arc to end up in two separate batches; in that case processTransposeBatch cannot remove the duplicates, because it works on one batch at a time.
If I understood correctly, inside BatchGraph this problem is solved when computing successorArray(). In the case of ArcLabelledBatchGraph, which inherits from ArcLabelledImmutableSequentialGraph and therefore must implement successor(), the removal of duplicates was moved to nextInt() to avoid code duplication; nextInt() also computes the right value for the outdegree.

I was thinking about how to solve the problem of duplicate arcs with different labels; I had forgotten about LabelMergeStrategy. I'm thinking of passing it to the ScatteredLabelledArcsASCIIGraph constructor and using it both when processing the batch and when computing the successors. Aren't we interested in equal arcs with different labels, though?

Edit: The labels are structured as key-value associations, which means we don't need multiple arcs with different labels. We simply merge the two labels and the process is invertible. This answers my question.
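For concreteness, here is a toy sketch of the merge I have in mind, using plain Java maps rather than the actual Label implementation (mergeLabels and the map-based representation are illustrative stand-ins, not WebGraph API):

```java
import java.util.HashMap;
import java.util.Map;

public final class KeyValueMergeExample {
	// Merging the key-value associations of two labels attached to the same
	// arc: the merged label keeps the associations of both, so deduplicating
	// the arc loses no information.
	static Map<String, Integer> mergeLabels(final Map<String, Integer> first, final Map<String, Integer> second) {
		final Map<String, Integer> merged = new HashMap<>(first);
		merged.putAll(second); // if a key occurs in both labels, the second one wins
		return merged;
	}

	public static void main(final String[] args) {
		System.out.println(mergeLabels(Map.of("weight", 3), Map.of("timestamp", 42)));
		// prints both associations, e.g. {weight=3, timestamp=42}
	}
}
```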

@vigna (Owner) commented on 10c1152 Feb 20, 2023
No, my question was: why is the ASCII class adding the same arc twice? That shouldn't happen (in theory).

OK, what happened here is the following (I think). Sorting successors and deduplicating them is expensive. When I implemented parallel compression, I realized that since parallel compression works by scanning the batch graph once, building a number of iterators, and then compressing (which implies a second pass), sorting and deduplicating in NodeIterator.nextInt() was foolish, because in the first pass there is no need for it. So I moved deduplication to successorArray(). That change was never ported to the labeled version, because up to a few weeks ago we did not have parallel compression of labeled graphs.

In the labeled version there is no deduplication because there is no mapOffline() method, so a labeled batch cannot contain duplicates.

Having parallel arcs (e.g., the same successor many times) is a possible extension for the future, but a lot of code depends on the successor lists being made of distinct elements. In that case we suggest using a label class that contains a list of values.

So what I suggest is that, first of all, you move the sorting code out, as in the non-labelled case. Then, we add a constructor to the batch graph with a LabelMergeStrategy. My idea is that if the code finds two parallel arcs and there is no LabelMergeStrategy, an exception is thrown. Otherwise, it uses the strategy to generate a single label. For existing code (transposition) throwing an exception is fine. For the new class, the user can pass a strategy.
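In code, the idea is roughly the following (a sketch, not the final API: dedup and merger are placeholder names, and I am assuming the usual LabelMergeStrategy.merge(first, second) signature):

```java
import it.unimi.dsi.webgraph.labelling.Label;
import it.unimi.dsi.webgraph.labelling.LabelMergeStrategy;

final class DedupSketch {
	/** Deduplicates a sorted successor[0..numPairs) slice, carrying label[]
	 * along; on a parallel arc, merges the two labels if a strategy is
	 * available, and fails fast otherwise. Returns the outdegree. */
	static int dedup(final int[] successor, final Label[] label, final int numPairs, final LabelMergeStrategy merger) {
		if (numPairs == 0) return 0;
		int p = 0;
		for (int j = 1; j < numPairs; j++) {
			if (successor[p] != successor[j]) {
				// New distinct successor: keep it, together with its label.
				successor[++p] = successor[j];
				label[p] = label[j];
			}
			else if (merger == null) throw new IllegalArgumentException("Parallel arcs, but no LabelMergeStrategy was provided");
			else label[p] = merger.merge(label[p], label[j]);
		}
		return p + 1;
	}
}
```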

@lfoscari (Author) commented on 10c1152
If I understand correctly, it makes more sense to dedup only when computing the successor array. But this causes an issue with the array of labels, which is created on the first call to nextInt and sized according to the outdegree. Since it's impossible to accurately compute the outdegree without sorting first, the solution is simple: we can override the labelArray method and do the deduplication there (keeping in mind not to do it again when calling successorArray(), or vice versa).
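Concretely, I imagine something along these lines (a sketch with placeholder names, not the actual class):

```java
import it.unimi.dsi.webgraph.labelling.Label;

// Whichever of successorArray() or labelArray() is called first triggers the
// single sort-and-dedup pass; the other call then sees the cleaned arrays.
abstract class LazyDedupIterator {
	int[] successor;
	Label[] label;
	int numPairs, outdegree;
	private boolean deduplicated;

	/** Sorts successor[0..numPairs) permuting label[] alongside, removes
	 * duplicate successors (merging their labels), and sets the outdegree. */
	abstract void sortAndDedup();

	private void ensureDeduplicated() {
		if (deduplicated) return;
		deduplicated = true;
		sortAndDedup();
	}

	public int[] successorArray() { ensureDeduplicated(); return successor; }
	public Label[] labelArray() { ensureDeduplicated(); return label; }
}
```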

@lfoscari (Author) commented on 10c1152 Feb 20, 2023
Maybe I'm missing something; can you please elaborate on exactly why there is no need for sorting and deduplication in the first pass? As far as I understand, the method NodeIterator.nextInt() reads from multiple batches, and each batch is already sorted and deduplicated, but where is the guarantee that combining their contents will yield arcs without duplication? (As noted above, with a very small batch size two copies of the same arc can end up in two different batches, each of which is individually duplicate-free.)
