Eumcoz
Eumcoz

Reputation: 2458

Getting previous window value for processing late events

I am looking for a way to set up windowing that allows for lateness and also lets me calculate values based on the values previously calculated for the session.

My session values are unique identifiers and should never collide, but events for a session can technically arrive at any time. For most sessions, most events are processed within 5 minutes; allowing a lateness of 1 day should cover any late events.

  // Key the events by session id and gather each session's events into
  // processing-time session windows that close after a 5-minute gap.
  // NOTE(review): allowedLateness is documented for event-time windowing; with a
  // processing-time assigner elements are presumably never treated as late,
  // so this setting may have no effect here. Confirm against the Flink docs.
  val sessionWindows = stream
    .keyBy(jsonEvent => jsonEvent.findValue("session").toString)
    .window(ProcessingTimeSessionWindows.withGap(Time.minutes(5)))
    .allowedLateness(Time.days(1))

  // Every window firing is summarised by SessionProcessor and shipped to HttpSink.
  sessionWindows
    .process(new SessionProcessor)
    .addSink(new HttpSink)

For each session I am finding the max value of a field and checking that a few events did not happen (if they do happen, they force the max value field to zero). I decided to create a ProcessWindowFunction to do this.

/**
 * Sketch of the per-session window function: emits one
 * `(String, String, String, Long)` tuple each time the window fires.
 *
 * This is pseudo-code. `maxValue`, `badEvent1`, `badEvent2`, `string1`,
 * `string2` and `string3` stand for values produced by the elided
 * "parse and calculate" step and are not defined in this snippet.
 */
class SessionProcessor extends ProcessWindowFunction[ObjectNode, (String, String, String, Long), String, TimeWindow] {

   /**
    * @param key      the session key the window belongs to
    * @param context  window context supplied by Flink
    * @param elements the events handed to this firing of the window
    * @param out      collector receiving the summary tuple
    */
   override def process(key: String, context: Context, elements: Iterable[ObjectNode], out: Collector[(String, String, String, Long)]): Unit = {
      //Parse and calculate data
      // Either disqualifying event forces the reported maximum to zero.
      maxValue = if (badEvent1 || badEvent2) 0 else maxValue
      out.collect((string1, string2, string3, maxValue))
   }
}

This works fine prior to allowing for late events. When a late event comes through, maxValue is recalculated and is outputted to HttpSink again. I am looking for a way so that I can calculate the delta of previous maxValue and late maxValue.

I am looking for a way to determine:

  1. Whether the call to the function was triggered by a late event (I do not want to double count the total number of sessions)
  2. What the new data is, or if there is a way, to store the previous calculated value.

Any help with this would be greatly appreciated.

Edit: New code used for ValueState

KafkaConsumer.scala

import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.connectors.kafka._
import org.apache.flink.streaming.util.serialization.JSONDeserializationSchema
import org.apache.flink.streaming.api.scala._
import com.fasterxml.jackson.databind.node.ObjectNode
import org.apache.flink.streaming.api.windowing.assigners.TumblingProcessingTimeWindows
import org.apache.flink.streaming.api.windowing.time.Time


/** Entry point of the job that reads JSON events from Kafka and prints per-session results. */
object KafkaConsumer {

  /**
   * Builds and runs the streaming job:
   * reads `ObjectNode` events from the "test-topic" Kafka topic (starting from
   * the latest offsets), keys them by `data.query.session`, groups them into
   * 5-second tumbling processing-time windows, runs [[SessionProcessor]] on
   * each firing and prints the results.
   *
   * NOTE(review): `allowedLateness` is documented for event-time windowing;
   * with a processing-time assigner it presumably has no effect. Confirm.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setStreamTimeCharacteristic(TimeCharacteristic.IngestionTime)

    // getServerProperties is defined outside this snippet.
    val properties = getServerProperties
    val consumer = new FlinkKafkaConsumer010[ObjectNode]("test-topic", new JSONDeserializationSchema, properties)
    consumer.setStartFromLatest()
    val stream = env.addSource(consumer)

    stream
      .keyBy { jsonEvent => jsonEvent.findValue("data").findValue("query").findValue("session").toString }
      .window(TumblingProcessingTimeWindows.of(Time.seconds(5)))
      .allowedLateness(Time.days(1))
      .process(new SessionProcessor)
      .print()

    env.execute("Kafka APN Consumer")
  }
}

SessionProcessor.scala

import org.apache.flink.util.Collector
import com.fasterxml.jackson.databind.node.ObjectNode
import org.apache.flink.api.common.state.{ValueState, ValueStateDescriptor}
import org.apache.flink.streaming.api.scala.function.ProcessWindowFunction
import org.apache.flink.streaming.api.windowing.windows.TimeWindow

/**
 * Window function that reports, for every firing of a session's window, how
 * much the maximum `data.query.value` has grown since the previous firing of
 * that same window.
 *
 * The maximum seen at the previous firing is kept in per-window state, so the
 * first firing reports the full maximum and later firings (e.g. triggered by
 * late events) report only the increase.
 */
class SessionProcessor extends ProcessWindowFunction[ObjectNode, (String, String, String, Long), String, TimeWindow] {

  /** Descriptor of the per-window state holding the maximum emitted at the previous firing. */
  final val previousValue = new ValueStateDescriptor("previousValue", classOf[Long])

  /**
   * Emits `(session, user, department, increase)` where `increase` is the
   * current maximum `value` among `elements` minus the maximum recorded at the
   * previous firing of this window.
   *
   * @param key      the session key the window belongs to
   * @param context  window context; its `windowState` stores the previous maximum
   * @param elements the events handed to this firing of the window
   * @param out      collector receiving the result tuple
   */
  override def process(key: String, context: Context, elements: Iterable[ObjectNode], out: Collector[(String, String, String, Long)]): Unit = {

    def stripQuotes(text: String): String = text.replaceAll("\"", "")

    val previousMaxState: ValueState[Long] = context.windowState.getState(previousValue)
    // State that was never written reads as null, which Scala unboxes to 0L,
    // so the first firing reports the full maximum.
    val previousMax: Long = previousMaxState.value

    var session = ""
    var user = ""
    var department = ""
    var currentMax: Long = 0

    elements.foreach { event =>
      val query = event.findValue("data").findValue("query")
      // Events without a "value" field count as 0.
      val rawValue = if (query.has("value")) stripQuotes(query.findValue("value").toString) else "0"

      // The identifying fields of the last element win.
      session = stripQuotes(query.findValue("session").toString)
      user = stripQuotes(query.findValue("user").toString)
      department = stripQuotes(query.findValue("department").toString)
      currentMax = math.max(currentMax, rawValue.toLong)
    }

    val increaseTime = currentMax - previousMax
    // Remember the maximum itself, not the delta: the next firing must
    // subtract the previously reported maximum to get a correct increase.
    previousMaxState.update(currentMax)
    out.collect((session, user, department, increaseTime))
  }

  /** Releases the per-window state when the window is purged. */
  override def clear(context: Context): Unit =
    context.windowState.getState(previousValue).clear()
}

Upvotes: 1

Views: 1075

Answers (1)

David Anderson
David Anderson

Reputation: 43707

Here's an example that does something similar. It's hopefully reasonably self-explanatory, and should be easy enough to adapt to your needs.

The basic idea here is that you can use context.windowState(), which is per-window state made available through the context passed to a ProcessWindowFunction. This windowState is only useful for windows that fire multiple times, since each new window instance starts with a freshly initialized (and empty) windowState store. For state that is shared across all windows (but still keyed), use context.globalState().

/**
 * Window function for windows that may fire more than once.
 *
 * For each firing it emits a pair of (number of earlier firings, value): the
 * first firing emits the window's value as-is, every later firing emits the
 * difference between the current value and the value seen at the previous
 * firing. Both pieces of bookkeeping live in per-window state and are released
 * in {@link #clear}.
 */
private static class DifferentialWindowFunction
  extends ProcessWindowFunction<Long, Tuple2<Long, Long>, String, TimeWindow> {

  /** Value observed at the previous firing of the window. */
  private static final ValueStateDescriptor<Long> previousFiringState =
    new ValueStateDescriptor<>("previous-firing", LongSerializer.INSTANCE);

  /** Running count of how many times the window has fired so far. */
  private static final ReducingStateDescriptor<Long> firingCounterState =
    new ReducingStateDescriptor<>("firing-counter", new Sum(), LongSerializer.INSTANCE);

  /**
   * @param key     key of the window
   * @param context window context giving access to the per-window state
   * @param values  the window contents; must hold exactly one element
   * @param out     collector receiving (earlier firings, value or delta)
   * @throws Exception if reading or updating the window state fails
   */
  @Override
  public void process(
      String key,
      Context context,
      Iterable<Long> values,
      Collector<Tuple2<Long, Long>> out) throws Exception {

    ValueState<Long> previousFiring = context.windowState().getState(previousFiringState);
    // Reducing state has its own accessor; getState only accepts a ValueStateDescriptor.
    ReducingState<Long> firingCounter = context.windowState().getReducingState(firingCounterState);

    Long output = Iterables.getOnlyElement(values);
    Long earlierFirings = firingCounter.get();
    if (earlierFirings == null) {
      // first firing: nothing to diff against
      out.collect(Tuple2.of(0L, output));
    } else {
      // subsequent firing: report only the change since the last firing
      out.collect(Tuple2.of(earlierFirings, output - previousFiring.value()));
    }
    firingCounter.add(1L);
    previousFiring.update(output);
  }

  /** Drops the per-window state when the window is purged. */
  @Override
  public void clear(Context context) {
    context.windowState().getState(previousFiringState).clear();
    context.windowState().getReducingState(firingCounterState).clear();
  }
}

Upvotes: 3

Related Questions