Reputation: 13
I am using a PDF to extract some data and convert it into a pandas DataFrame. I'm using the code below, taken from the Dash tutorial, to display the data in a Dash app. What I want to do next is upload the PDF through the app instead of hard-coding its path in the code. I could find similar examples for CSV files, but the same approach doesn't work for a PDF.
# Open the PDF in a context manager so the file handle is closed even if
# parsing raises. The reader keeps reading from this stream, so every
# operation that touches pdfReader must stay inside the block.
with open('test.pdf', 'rb') as pdfFileObj:
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
    #some operations on pdf to produce df1 and df2 using PyPDF2
def _lavender_table(table_id, frame):
    """Build a left-aligned DataTable showing every column and row of *frame*."""
    return dash_table.DataTable(
        id=table_id,
        columns=[{"name": col, "id": col} for col in frame.columns],
        data=frame.to_dict('records'),
        style_cell={'textAlign': 'left'},
        style_header={'backgroundColor': "paleturquoise"},
        style_data={'backgroundColor': "lavender"},
    )


app = Dash(__name__)

# Two identical sections: a heading, an (initially empty) paragraph with its
# own id, and a table built from df1 / df2 respectively.
app.layout = html.Div([
    html.H4('Some title'),
    html.P(id='table_out'),
    _lavender_table('table', df1),
    html.H4("Some title"),
    html.P(id='table_out1'),
    _lavender_table('table1', df2),
])
def _cell_text(frame, active_cell):
    """Return the value of *frame* at the clicked cell, or a prompt if none.

    ``active_cell`` is the DataTable property of the same name; when it is
    falsy (no cell selected) the prompt text is returned. Otherwise its
    ``row`` and ``column_id`` entries are used to locate the value.
    """
    if not active_cell:
        return "Click the table"
    return frame.iloc[active_cell['row']][active_cell['column_id']]


# One callback per table. Stacking both decorators on a single function made
# every click index into BOTH dataframes with the same cell (failing when the
# other frame lacks that row/column) and returned a 2-tuple to a single Output.
@app.callback(
    Output('table_out', 'children'),
    Input('table', 'active_cell'))
def update_graphs(active_cell):
    """Show the clicked cell of 'table' (backed by df1) in 'table_out'."""
    return _cell_text(df1, active_cell)


@app.callback(
    Output('table_out1', 'children'),
    Input('table1', 'active_cell'))
def update_graphs1(active_cell):
    """Show the clicked cell of 'table1' (backed by df2) in 'table_out1'."""
    return _cell_text(df2, active_cell)
# Start the development server only when this file is executed directly, so
# importing the module (e.g. from a WSGI server or a test) does not launch it.
if __name__ == '__main__':
    app.run_server(debug=True)
Upvotes: 1
Views: 1073
Reputation: 21
I've had success with the code below, which uses the tabula package (installed as tabula-py) to parse the contents of a PDF into a pandas DataFrame and display it in the app.
import dash
from dash import dcc
from dash import html
from dash.dependencies import Input, Output, State
import dash_table
import pandas as pd
import base64
import io
import tabula
# Pass __name__ so Dash can locate the assets folder relative to this module
# even when the file is imported rather than run as a script.
app = dash.Dash(__name__)
# Callback to parse contents of a pdf
@app.callback(Output('pdf-viewer', 'data'),
              Output('pdf-viewer', 'columns'),
              Input('pdf-upload', 'contents'),
              State('pdf-upload', 'filename'),
              prevent_initial_call=True
              )
def pdf_output(contents, filename):
    """Parse an uploaded PDF and return table rows plus column definitions.

    Args:
        contents: Value of the upload component's ``contents`` property,
            treated here as ``"<content type>,<base64 payload>"``. May be
            ``None``.
        filename: Name of the uploaded file (currently unused).

    Returns:
        A ``(data, columns)`` tuple for the 'pdf-viewer' DataTable.

    Raises:
        dash.exceptions.PreventUpdate: If there is nothing to show (no
            upload, or no table found on page 1); the table is left as is.
    """
    if contents is None:
        # Falling through would return a bare None for two Outputs, which
        # Dash rejects; skip the update instead.
        raise dash.exceptions.PreventUpdate

    # Split on the first comma only: everything after it is the payload.
    _content_type, content_string = contents.split(',', 1)
    decoded = base64.b64decode(content_string)

    #My pdf only has one page and one table with two columns
    tables = tabula.read_pdf(io.BytesIO(decoded), pages=1, pandas_options={'header': None})
    if not tables:
        # No table detected on the page: indexing [0] would raise IndexError.
        raise dash.exceptions.PreventUpdate

    df = tables[0]
    df.columns = ['Parameter', 'Value']
    columns = [{"name": i, "id": i, 'editable': True} for i in df.columns]
    return df.to_dict('records'), columns
#Upload component:
pdf_load = dcc.Upload(
    id='pdf-upload',
    children=html.Div(['Drag and Drop or ', html.A('Select PDF files')]),
    # Dashed drop-zone styling.
    style={'width': '90%', 'height': '60px', 'lineHeight': '60px',
           'borderWidth': '1px', 'borderStyle': 'dashed',
           'borderRadius': '5px', 'textAlign': 'center', 'margin': '10px'},
    # Restrict the picker/drop zone to PDFs: the parsing callback hands the
    # bytes straight to a PDF table reader and cannot handle other file types.
    accept='.pdf',
)
#Table to view output from pdf:
# Created without data or columns; both are set through the 'pdf-viewer' id.
pdf_table = dash_table.DataTable(
    id='pdf-viewer',
    page_action='none',
    fixed_rows=dict(headers=True),
    style_table=dict(height=500, overflowY='auto'),
    style_header=dict(overflowY='auto'),
)
#Place into the app
# Page order: title, upload drop zone, a line break, then the result table.
_page_children = [html.H4('Some title'), pdf_load, html.Br(), pdf_table]
app.layout = html.Div(_page_children)
# Launch the server (debug mode off) only when run as a script.
if __name__ == "__main__":
    app.run_server(debug=False)
Output (no formatting applied) should look something like this: Example dash
Upvotes: 2