Reputation: 14611

Apache Beam framework - sort in descending order

How do you sort in descending order using the Apache Beam framework?

I managed to create a word count pipeline that sorts the output alphabetically by word, but I could not figure out how to invert the sort order.

Here is the code:

public class SortedWordCount {

    public static void main(String[] args) {
        PipelineOptions options = PipelineOptionsFactory.create();
        Pipeline p = Pipeline.create(options);

        BufferedExternalSorter.Options options1 = BufferedExternalSorter.options();

        p.apply(TextIO.read().from("d:/dev/playground/apache/beam/word-count-beam/src/test/resources/bible/whole_bible.txt"))
                .apply("ExtractWords", ParDo.of(new DoFn<String, String>() {
                    @ProcessElement
                    public void processElement(ProcessContext c) {
                        for (String word : c.element().split(ExampleUtils.TOKENIZER_PATTERN)) {
                            if (!word.isEmpty()) {
                                c.output(word);
                            }
                        }
                    }
                }))
                .apply(Count.perElement())
                .apply(ParDo.of(new DoFn<KV<String, Long>, KV<String, Long>>() {
                    @ProcessElement
                    public void processElement(ProcessContext c){
                        KV<String, Long> element = c.element();
                        if(element.getKey().length() > 2) {
                            c.output(element);
                        }
                    }
                }))
                .apply("CreateKey", MapElements.via(new SimpleFunction<KV<String, Long>, KV<String, KV<String, Long>>>() {
                    public KV<String, KV<String, Long>> apply(KV<String, Long> input) {
                        return KV.of("sort", KV.of(input.getKey().toLowerCase(), input.getValue()));
                    }
                }))
                .apply(GroupByKey.create())
                .apply(SortValues.create(options1))
                .apply("FormatResults", MapElements.via(new SimpleFunction<KV<String, Iterable<KV<String, Long>>>, String>() {
                    @Override
                    public String apply(KV<String, Iterable<KV<String, Long>>> input) {
                        return StreamSupport.stream(input.getValue().spliterator(), false)
                                .map(value -> String.format("%20s: %s", value.getKey(), value.getValue()))
                                .collect(Collectors.joining(String.format("%n")));
                    }
                }))
                .apply(TextIO.write().to("bible"));
        // Run the pipeline.
        p.run().waitUntilFinish();
    }
}

This code produces a list of words sorted alphabetically, each with its respective count:

           aaron: 350
       aaronites: 2
         abaddon: 1
         abagtha: 1
           abana: 1
          abarim: 4
           abase: 4
          abased: 4
         abasing: 1
          abated: 6
            abba: 3
            abda: 2
          abdeel: 1
            abdi: 3
          abdiel: 1
           abdon: 8
        abednego: 15
            abel: 16
 abelbethmaachah: 2
        abelmaim: 1

Edit 1:

After some debugging I know that the code uses the class:

org.apache.beam.sdk.extensions.sorter.InMemorySorter

This class uses a static final Comparator during the execution of the sort method:

// Fixed ordering used for record keys: byte arrays compared lexicographically as unsigned bytes.
private static final Comparator<byte[]> COMPARATOR = UnsignedBytes.lexicographicalComparator();

/**
 * Sorts the buffered records in place by key and exposes them as an unmodifiable list.
 *
 * <p>May be invoked only once per instance; the {@code checkState} guard rejects a second call.
 * The ordering always comes from the static {@link #COMPARATOR}; the method takes no parameter
 * through which a caller could supply a different one.
 *
 * @return the records ordered by key, wrapped so that callers cannot modify the list
 */
public Iterable<KV<byte[], byte[]>> sort() {
  checkState(!sortCalled, "sort() can only be called once.");
  sortCalled = true;

  // Only the keys take part in the comparison; values are carried along untouched.
  records.sort((left, right) -> COMPARATOR.compare(left.getKey(), right.getKey()));
  return Collections.unmodifiableList(records);
}

There is no way to inject a comparator in this class.

Upvotes: 3

Answers (3)

haliluyaya

Reputation: 1

def sort_data(data, descending=False):
  """Return a new list with the items of ``data`` ordered by their first element.

  ``data`` may be any iterable (list, tuple, generator, ...), not only a list;
  it is never modified. Items whose first elements compare equal keep their
  original relative order.

  Args:
    data: iterable of indexable items, e.g. ``(key, value)`` tuples.
    descending: when true, order from the largest first element to the
      smallest. Defaults to ascending order.

  Returns:
    A new ``list`` containing the sorted items.
  """
  # sorted() copies and sorts in one step and accepts any iterable, whereas
  # data.copy() followed by .sort() only works when data is a list.
  return sorted(data, key=lambda item: item[0], reverse=descending)

with beam.Pipeline() as pipeline:
  # The whole pipeline as one chained expression: create the sample pairs,
  # collect them into a single group, sort that group, and print the result.
  _ = (
      pipeline
      | 'Data' >> beam.Create([
          ('p', 1),
          ('a', 2),
          ('p', 3),
          ('m', 2),
      ])
      # Give every element the same dummy key ...
      | beam.Map(lambda pair: (0, pair))
      # ... and the same (global) window ...
      | 'window' >> beam.WindowInto(beam.window.GlobalWindows())
      # ... so that grouping gathers all elements into one group.
      | beam.GroupByKey()
      # Drop the dummy key, keeping only the grouped values.
      | beam.Map(lambda grouped: grouped[1])
      # Sort the single collection that now holds everything.
      | beam.Map(sort_data)
      | beam.Map(print)
  )

Upvotes: -1

gil.fernandes

Reputation: 14611

I ended up following jkff's advice and rewrote the small WordCount example using Apache Beam. I also got rid of SortValues: I simply grouped the records under a single key and then did the sorting myself.

This is what I came up with:

import org.apache.beam.examples.common.ExampleUtils;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.*;
import org.apache.beam.sdk.values.KV;

import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.Locale;
import java.util.function.Supplier;
import java.util.stream.StreamSupport;

/**
 * Word-count pipeline that writes every word longer than one character together with its number
 * of occurrences, ordered by word in descending (reverse alphabetical) order.
 *
 * <p>Instead of relying on {@code SortValues}, all {@code (word, count)} pairs are grouped under
 * one constant key and the resulting group is sorted in memory with an explicit comparator.
 */
public class DescendingWordCount {

    /**
     * Builds the pipeline and blocks until it has finished.
     *
     * @param args command-line arguments; not used, the pipeline runs with default options
     */
    public static void main(String[] args) {
        PipelineOptions options = PipelineOptionsFactory.create();
        Pipeline p = Pipeline.create(options);
        p.apply(TextIO.read().from("d:/whole_bible.txt"))
                .apply("ExtractWords", ParDo.of(new DoFn<String, String>() {
                    @ProcessElement
                    public void processElement(ProcessContext c) {
                        for (String word : c.element().split(ExampleUtils.TOKENIZER_PATTERN)) {
                            if (word.length() > 1) {
                                // Locale.ROOT keeps the case folding independent of the JVM's
                                // default locale.
                                c.output(word.toLowerCase(Locale.ROOT));
                            }
                        }
                    }
                }))
                .apply(Count.perElement())
                .apply("CreateKey", ParDo.of(new DoFn<KV<String, Long>, KV<String, KV<String, Long>>>() {
                    @ProcessElement
                    public void processElement(ProcessContext c) {
                        // A single constant key sends every (word, count) pair to the same group.
                        c.output(KV.of("single", c.element()));
                    }
                }))
                .apply(GroupByKey.create())
                .apply("FormatResults",
                        MapElements.via(
                                new SimpleFunction<KV<String, Iterable<KV<String, Long>>>, String>() {
                                    @Override
                                    public String apply(KV<String, Iterable<KV<String, Long>>> input) {
                                        return formatDescending(input.getValue());
                                    }
                                }
                        ))
                .apply(TextIO.write().withNumShards(1).to("minimal-wordcount-bible"));
        p.run().waitUntilFinish();
    }

    /**
     * Renders the given word counts as text, one right-aligned {@code "word : count"} line per
     * entry, ordered by word from last to first.
     *
     * @param counts the {@code (word, count)} pairs to render; not modified
     * @return the formatted lines, each terminated by the platform line separator
     */
    private static String formatDescending(Iterable<KV<String, Long>> counts) {
        // Copy into a list first: the grouped Iterable itself cannot be sorted.
        List<KV<String, Long>> entries = new ArrayList<>();
        counts.forEach(entries::add);
        entries.sort(Comparator.comparing((KV<String, Long> kv) -> kv.getKey()).reversed());

        StringBuilder report = new StringBuilder();
        for (KV<String, Long> kv : entries) {
            report.append(String.format("%20s : %d%n", kv.getKey(), kv.getValue()));
        }
        return report.toString();
    }
}

This produces output such as:

          zuzims : 1
     zurishaddai : 5
          zuriel : 1
             zur : 5
            zuph : 3
            zuar : 5
       zorobabel : 3
         zorites : 1
          zoreah : 1
      zorathites : 1
           zorah : 8
          zophim : 1
          zophar : 4
          zophai : 1
          zophah : 2
          zoheth : 1
        zoheleth : 1
           zohar : 4
         zobebah : 1
           zobah : 11
            zoba : 2
            zoar : 10
            zoan : 7
           zizah : 1
            ziza : 2
             ziz : 1
          zithri : 1
        zipporah : 3

Upvotes: 2

jkff

Reputation: 17913

You can extract the Iterable<KV<String, Long>> into a List<KV<String, Long>> and reverse the list using Collections.reverse().

Upvotes: 0

Apache Beam framework - sort in descending order

Edit 1:

Answers (3)

Related Questions