user20153724
user20153724

Reputation: 13

Using Dash to upload a PDF and display a pandas DataFrame extracted from it

I am extracting some data from a PDF and converting it into a pandas DataFrame. I'm using the code below, adapted from the Dash tutorial, to display the data in a Dash app. What I want to do next is upload the PDF through the app instead of hard-coding its path in the code. I could find similar examples for CSV files, but with a PDF it doesn't work the same way.

# Open the hard-coded PDF in binary mode and wrap the file object in a PyPDF2 reader.
# NOTE(review): the file handle is never closed in the code shown here.
pdfFileObj = open('test.pdf', 'rb')
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
# Placeholder: the PyPDF2 operations that build df1 and df2 are omitted from this snippet.

def _styled_table(table_id, frame):
    """Build a DataTable with the given id showing all columns and rows of ``frame``."""
    return dash_table.DataTable(
        id=table_id,
        columns=[{"name": col, "id": col} for col in frame.columns],
        data=frame.to_dict('records'),
        style_cell={'textAlign': 'left'},
        style_header={'backgroundColor': "paleturquoise"},
        style_data={'backgroundColor': "lavender"},
    )


app = Dash(__name__)

# Each table is preceded by a heading and a paragraph that the callbacks
# fill in with the clicked cell's value.
app.layout = html.Div(children=[
    html.H4('Some title'),
    html.P(id='table_out'),
    _styled_table('table', df1),
    html.H4("Some title"),
    html.P(id='table_out1'),
    _styled_table('table1', df2),
])

def _cell_value(frame, active_cell):
    """Return the value of ``frame`` at the clicked cell, or a prompt if no cell is active.

    ``active_cell`` is the DataTable ``active_cell`` property: falsy when
    nothing is selected, otherwise a dict read here for its ``'row'`` and
    ``'column_id'`` entries.
    """
    if not active_cell:
        return "Click the table"
    return frame.iloc[active_cell['row']][active_cell['column_id']]


# One callback per table: each paragraph is driven only by its own table's
# active cell and looks the value up in that table's own dataframe. Sharing a
# single function for both tables would index df1 and df2 with the same cell
# and return two values to a callback that declares a single Output.
@app.callback(
    Output('table_out', 'children'),
    Input('table', 'active_cell'))
def update_graphs(active_cell):
    """Show the value of the cell clicked in the first table (backed by df1)."""
    return _cell_value(df1, active_cell)


@app.callback(
    Output('table_out1', 'children'),
    Input('table1', 'active_cell'))
def update_graphs1(active_cell):
    """Show the value of the cell clicked in the second table (backed by df2)."""
    return _cell_value(df2, active_cell)
# Start the Dash server with debug mode switched on.
app.run_server(debug=True)

Upvotes: 1

Views: 1073

Answers (1)

pyDan2022
pyDan2022

Reputation: 21

I've had success with the code below, which uses the tabula package to parse the contents of a PDF into a pandas DataFrame and display it in the app.

import dash
from dash import dcc
from dash import html
from dash.dependencies import Input, Output, State
import dash_table
import pandas as pd
import base64
import io
import tabula

# Create the Dash application that the callback and layout below attach to.
app = dash.Dash()

# Callback to parse the contents of an uploaded PDF into table data
@app.callback(Output('pdf-viewer', 'data'),
              Output('pdf-viewer', 'columns'),
              Input('pdf-upload', 'contents'),
              State('pdf-upload', 'filename'),
              prevent_initial_call=True
              )
def pdf_output(contents, filename):
    """Parse the uploaded PDF and return the rows and column specs for the table.

    Args:
        contents: Value of the upload component's ``contents`` property; a
            string of the form ``"<header>,<base64 payload>"`` or ``None``.
        filename: Name of the uploaded file (received as State; not used).

    Returns:
        A ``(data, columns)`` pair for the ``pdf-viewer`` DataTable. Both
        outputs are left untouched when there is no upload, and both are
        emptied when no table is found on the first page.
    """
    if contents is None:
        # Two outputs are declared, so falling through with an implicit
        # ``None`` is not a valid return value; leave the table as it is.
        return dash.no_update, dash.no_update

    # Split only on the first comma so the payload is taken whole.
    _, content_string = contents.split(',', 1)
    decoded = base64.b64decode(content_string)

    tables = tabula.read_pdf(io.BytesIO(decoded), pages=1, pandas_options={'header': None})
    if not tables:
        # No table detected on page 1: show an empty table rather than
        # failing with an IndexError.
        return [], []

    # My pdf only has one page and one table with two columns
    df = tables[0]
    df.columns = ['Parameter', 'Value']

    columns = [{"name": i, "id": i, 'editable': True} for i in df.columns]
    return df.to_dict('records'), columns

# Upload component: a dashed drop zone that also lets the user browse for a PDF.
pdf_load = dcc.Upload(
    id='pdf-upload',
    children=html.Div(children=[
        'Drag and Drop or ',
        html.A('Select PDF files'),
    ]),
    style=dict(
        width='90%',
        height='60px',
        lineHeight='60px',
        borderWidth='1px',
        borderStyle='dashed',
        borderRadius='5px',
        textAlign='center',
        margin='10px',
    ),
)

# Table that shows the data parsed from the PDF; it starts without data or
# columns and is filled in by the upload callback.
pdf_table = dash_table.DataTable(
    id='pdf-viewer',
    page_action='none',
    fixed_rows=dict(headers=True),
    style_table=dict(height=500, overflowY='auto'),
    style_header=dict(overflowY='auto'),
)
# Assemble the page: title, upload zone, a line break, then the result table.
app.layout = html.Div(children=[
    html.H4('Some title'),
    pdf_load,
    html.Br(),
    pdf_table,
])


if __name__ == "__main__":
    # Only start the server when this file is run as a script.
    app.run_server(debug=False)

Output (no formatting applied) should look something like this: Example dash

Upvotes: 2

Related Questions