Reputation: 37
In my flow, I query Hive, then update the file names, and then I want to combine these CSVs into one Excel workbook with multiple worksheets. I was able to merge the two CSV files into one Excel workbook with multiple worksheets using this code. How do I get the script to use the two files from the NiFi flow instead of pulling the files from a directory on my PC? I've seen that I can do "flowFile = session.get()", but does this line capture both flowfiles?
import glob
import csv
import xlwt
import os
import xlsxwriter
import datetime
from org.apache.commons.io import IOUtils
from java.nio.charset import StandardCharsets
from org.apache.nifi.processor.io import StreamCallback
# Output workbook; its file name embeds the current time as HH-MM-SS.
wb = xlsxwriter.Workbook(
    "combined_at%s.xlsx" % datetime.datetime.now().strftime('%H-%M-%S')
)

# `session` is not defined in this script; presumably it is injected by the
# NiFi ExecuteScript processor. NOTE(review): `flowFile` is not used by any
# of the visible code below.
flowFile = session.get()

# Characters removed from every field before it is written to a sheet.
replacer = ",[]\"\"'\'"

# One sheet per exported attribute, created in this exact order.
(worksheet, worksheet2, worksheet3, worksheet4,
 worksheet5, worksheet6, worksheet7) = map(
    wb.add_worksheet,
    ("make", "ownership", "marital", "drivers", "vehicles", "age", "vyear")
)
def printHashedEmail(split_row, worksheet, index):
    """Write the cleaned first field of a row into column 0 of a sheet.

    Every character contained in the module-level ``replacer`` string is
    removed from ``split_row[0]`` (presumably the hashed e-mail, going by
    the function name). The cleaned text is stored back into ``split_row``
    and then written to ``worksheet``.

    Args:
        split_row: List of string fields of one CSV line; element 0 is
            overwritten with its cleaned form.
        worksheet: Object exposing ``write(row, col, value)``.
        index: Row number handed to ``worksheet.write``.

    Returns:
        None.
    """
    cleaned = split_row[0]
    for unwanted in replacer:
        cleaned = cleaned.replace(unwanted, "")
    # Callers reuse the same list for several sheets, so keep the cleaned
    # value in place exactly as the original code did.
    split_row[0] = cleaned
    worksheet.write(index, 0, cleaned)
def printOtherOnes(split_row, worksheet, index, non_changing_index):
    """Write one cleaned field of a row into column 1 of a sheet.

    Every character contained in the module-level ``replacer`` string is
    removed from ``split_row[non_changing_index]``. The cleaned text is
    stored back into ``split_row`` and then written to ``worksheet``.

    Args:
        split_row: List of string fields of one CSV line; the selected
            element is overwritten with its cleaned form.
        worksheet: Object exposing ``write(row, col, value)``.
        index: Row number handed to ``worksheet.write``.
        non_changing_index: Position in ``split_row`` of the field to write.

    Returns:
        None.
    """
    cleaned = split_row[non_changing_index]
    for unwanted in replacer:
        cleaned = cleaned.replace(unwanted, "")
    split_row[non_changing_index] = cleaned
    # The value always lands in the second spreadsheet column (column 1),
    # regardless of which CSV column it came from.
    worksheet.write(index, 1, cleaned)
# "1.csv": CSV column 2 goes to `worksheet`, column 3 to `worksheet2`.
# Each sheet receives the cleaned first field in column 0 and the attribute
# value in column 1, but only for lines where that attribute is non-empty.
with open("1.csv") as csv1:
    # Next free row on each sheet; advanced only when a row is written, so
    # the sheets stay densely packed even when lines are skipped.
    i = 0  # next row on `worksheet`
    j = 0  # next row on `worksheet2`
    for row in csv1:
        split_row = row.split(",")
        if split_row[2] != "":
            printHashedEmail(split_row, worksheet, i)
            printOtherOnes(split_row, worksheet, i, 2)
            i = i + 1
        # .strip() so that a field holding only whitespace (e.g. the line
        # terminator on the final field) counts as empty.
        if split_row[3].strip() != "":
            printHashedEmail(split_row, worksheet2, j)
            printOtherOnes(split_row, worksheet2, j, 3)
            j = j + 1
# "2.csv": CSV columns 2, 3, 5, 4 and 6 go to `worksheet3`, `worksheet4`,
# `worksheet5`, `worksheet6` and `worksheet7` respectively. Each sheet
# receives the cleaned first field in column 0 and the attribute value in
# column 1, but only for lines where that attribute is non-empty.
with open("2.csv") as csv1:
    # Next free row on each sheet; advanced only when a row is written.
    i = 0  # next row on `worksheet3`
    k = 0  # next row on `worksheet6`
    j = 0  # next row on `worksheet4`
    l = 0  # next row on `worksheet5`
    m = 0  # next row on `worksheet7`
    for row in csv1:
        split_row = row.split(",")
        if split_row[2] != "":
            printHashedEmail(split_row, worksheet3, i)
            printOtherOnes(split_row, worksheet3, i, 2)
            i = i + 1
        if split_row[3].strip() != "":
            printHashedEmail(split_row, worksheet4, j)
            printOtherOnes(split_row, worksheet4, j, 3)
            j = j + 1
        if split_row[5] != "":
            printHashedEmail(split_row, worksheet5, l)
            printOtherOnes(split_row, worksheet5, l, 5)
            l = l + 1
        if split_row[4].strip() != "":
            printHashedEmail(split_row, worksheet6, k)
            printOtherOnes(split_row, worksheet6, k, 4)
            k = k + 1
        # .strip() so that a field holding only whitespace (e.g. the line
        # terminator on the final field) counts as empty.
        if split_row[6].strip() != "":
            printHashedEmail(split_row, worksheet7, m)
            printOtherOnes(split_row, worksheet7, m, 6)
            m = m + 1
# All rows have been written: close the workbook, then report completion.
wb.close()
print("Done")
After manipulation, I would like the Excel file to exit the ExecuteScript processor so I can do more with it.
Upvotes: 0
Views: 635
Reputation: 28644
Check the different session.get() methods.
For example, session.get(2) will try to get the first 2 files from the incoming queue.
If you get only one, you can call session.rollback() to return it to the queue.
But the problem here is that the files in the queue could be in a different order than you expected. Just imagine you have 3 files in the incoming queue.
With session.get(FlowFileFilter filter) you can select from the incoming queue the 2 files that match on some attribute.
Upvotes: 2