Reputation: 11
I have irregular-format PDF invoice files with multiple pages, and I want an Excel file in return with the data extracted from those PDF files. For this I wrote code with the pdfplumber library in Python, but because of the irregular format I am not able to use regex properly to read and extract the data.
I want the columns area_name, Date, Tran, Inv #, Product Description, Packing, Rate, Qty, Bonus, Gross, Discount, Sales Tax and Net Amt in the Excel file, with missing values filled in as zero. A picture of the expected Excel output is attached for review.
this is the file: https://github.com/HananWali/Ak-pharma.git
import re
import requests
import pdfplumber
import pandas as pd
from collections import namedtuple
# Exploratory pass: inspect one page to see which lines carry an area header.
with pdfplumber.open('AK PHARMA (1).pdf') as pdf:
    page = pdf.pages[19]
    text = page.extract_text()

# One record per invoice detail line; the area name is prepended to the
# eleven fields captured from the line itself.
Inv = namedtuple('Inv', 'area_name date_p Tran prd_inv prd_des pack rate QTY Bonus Gross Dis Net_Am')

# The area headers form a fixed, known set of branch/town names, so match
# them literally rather than guessing a generic shape.
new_vend_re = re.compile(r'\b(Chichawatni|Harrapa|Kasowal|Mianchannu|Malka Hans|Gogera|Qabula|Arif Wala|Sahiwal|90 Morh|S1 High Street|S2 Mission Chowk|S3 DHQ Hospital|S4 Farid Town|S5 Jahaz Ground|S7 Karbala Road|S7 Mazdoor Puli|S8 TBZ Colony)\b', re.IGNORECASE)

for line in text.split('\n'):
    # Use search() (not the anchored match()) and keep only the matched
    # name, consistent with the main extraction loop below.
    found = new_vend_re.search(line)
    if found:
        area_name = found.group(0)
        print(area_name)
# One invoice detail line:
#   date  [Sale|Return]  inv#  description  packing  rate  qty  bonus  gross  discount  net
# NOTE(review): the pattern captures exactly 11 fields and has no separate
# Sales Tax group; verify against the PDF how lines with both Discount and
# Sales Tax filled in are handled.
pattern = re.compile(r'(\d{2}-[A-Za-z]{3}-\d{4})\s+(Sale|Return)?\s*(\S+[-\S]+)\s+([A-Za-z0-9\s]+)\s+(\d+[A-Za-z]+)\s+([\d\.]+)\s+(\d+)\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)')

lines = []
with pdfplumber.open('AK PHARMA (1).pdf') as pdf:
    for page in pdf.pages:
        text = page.extract_text()
        # Reset per page; detail lines are only kept once an area header
        # has been seen on the current page.
        area_name = None
        for line in text.split('\n'):
            # An area-header line starts a new group of invoice lines.
            found = new_vend_re.search(line)
            if found:
                area_name = found.group(0)
                continue
            line_match = pattern.search(line)
            if line_match and area_name:
                # BUG FIX: the original stored group(0) -- the WHOLE match --
                # as the date and shifted every later field by one, silently
                # dropping the 11th captured column.  groups() yields exactly
                # the 11 captured fields in order.
                lines.append(Inv(area_name, *line_match.groups()))

# Print results in a more readable format
for inv in lines:
    print(inv)

df = pd.DataFrame(lines)
df.head()
df.to_csv('inv_lines.csv')
Upvotes: 1
Views: 88
Reputation: 7250
pdfplumber 0.11.4
Data source: https://github.com/HananWali/Ak-pharma/tree/main
For this particular case, we have a modest number of pages and can visually confirm that there will be no misplacement of empty fields in the future. Therefore, we can use a more carefully designed pattern to extract data in groups. For a more general case, where interpreting data positions with pattern matching is ambiguous, we can split the data into cells using extract_table
with the 'explicit'
strategy for vertical lines and 'text'
for horizontal ones.
The number of pages isn't big, so we can visually confirm that the only fields that may be empty are 'Sales Tax' and 'Discount'.
With this in mind, we can design the following pattern to catch the data:
import re

# One invoice detail row, anchored at both ends.  The two money columns
# that may be blank ('Discount', 'SalesTax') sit inside optional
# non-capturing wrappers, so their named groups come back as None when
# the column is empty.
data_pattern = re.compile(r"""
    ^(?P<Date>\d{2}-[A-Za-z]{3}-\d{4})\s+      # e.g. 01-Jan-2024
    (?P<Tran>[A-Za-z]+)\s+                     # Sale / Return
    (?P<Inv>\S+)\s+                            # invoice number
    (?P<ProductDescription>.*?)\s+             # lazy: stops before the packing token
    (?P<Packing>\S+)\s+
    (?P<Rate>(?:\d+,)?\d+[.]\d{2})\s+          # allows a rare thousands comma
    (?P<Qty>[1-9][0-9]*|0)\s+
    (?P<Bonus>[1-9][0-9]*|0)\s+
    (?P<Gross>\d+[.]\d{2})\s+
    (?:(?P<Discount>\d+[.]\d{2})\s+)?          # may be empty
    (?:(?P<SalesTax>\d+[.]\d{2})\s+)?          # may be empty
    (?P<NetAmt>\d+[.]\d{2})$
""", re.VERBOSE)
The key features:

- '(?P<my_group_name>...)' names the matched group;
- '(?:...)' matches without saving the group (we need this to avoid duplicating '\s+' as a field separator).

Note that this pattern will not work if 'Sales Tax' is filled in but 'Discount' is empty.
With this pattern, we can extract the group dictionary from matching lines and append to the area list. Later, pandas
will interpret them correctly:
import pdfplumber
import pandas as pd
from collections import defaultdict

source = '.../AK PHARMA (1).pdf'

# The fixed list of area headers, lower-cased once so membership tests
# are case-insensitive.
lands = set(
    (
        'Chichawatni|Harrapa|Kasowal|Mianchannu|Malka Hans|Gogera|'
        'Qabula|Arif Wala|Sahiwal|90 Morh|S1 High Street|'
        'S2 Mission Chowk|S3 DHQ Hospital|S4 Farid Town|S5 Jahaz Ground|'
        'S7 Karbala Road|S7 Mazdoor Puli|S8 TBZ Colony'
    ).lower().split('|')
)

# area header line -> list of row dicts captured under that header
data = defaultdict(list)

with pdfplumber.open(source) as pdf:
    area = None
    for page in pdf.pages:
        for line in page.extract_text().split('\n'):
            if line.lower() in lands:
                # Remember the current area; it applies to every matching
                # data line until the next header appears.
                area = line
            else:
                rec = data_pattern.match(line)
                if rec is not None:
                    data[area].append(rec.groupdict())

# One DataFrame per area, tagged with its 'Area' column, then stacked.
result = pd.concat(
    [pd.DataFrame(table).assign(Area=key) for key, table in data.items()],
    ignore_index=True)
Convert the data type if necessary, and you're good to go. Nevertheless, don't rush to stop with this approach, read to the end. What if there were a line with 'Sales Tax'
but no 'Discount'
? Or what if we missed some subtle pattern variation?
Placing the data correctly using pattern matching won't be easy in a case like the following one:
Gross Disc Tax Net
1572.50 157.25 1415.25
208.25 1041.25 104.13
2937.13 532.30 2661.50
266.15 395.30
When working with PDFs, we can't rely on spacing to position extracted text based on the number of space characters, even when using layout=True
and additional parameters to mimic the original structure.
Also, we might overlook certain pattern details, like a rare comma in numbers (see 'Rate'
on page 4 and further) or a unique empty field in a column (see the line with Inv = 7988-R
on page 6 where 'NetAmt'
is empty; the pattern above will miss it).
In such cases, I believe extract_table is more reliable. The column widths appear to be fixed, which suggests using the 'explicit'
strategy for vertical lines, while the text lines correspond to the table rows, indicating the use of the 'text'
strategy for horizontal lines. Or we can use 'explicit'
for both directions. For example:
import re

import pdfplumber

source = ...  # a path to the pdf-file

# Rows start with a date like 12-Jan-2024.  FIX: raw string -- a plain
# '\d' is an invalid string escape and warns on modern Python.
date_pattern = re.compile(r'^\d{2}-[A-Z][a-z]{2}-\d{4}')

# FIX: keep the handle instead of chaining pdfplumber.open(...).pages[0],
# which leaks the open file.  Close with pdf.close() when finished.
pdf = pdfplumber.open(source)
page = pdf.pages[0]

# Collect one horizontal ruler per data row: top and bottom of the first
# matching line, then the bottom of each consecutive matching line.
explicit_horizontal_lines = []
records = iter(page.extract_text_lines())
for rec in records:
    if date_pattern.match(rec['text']):
        explicit_horizontal_lines.append(rec['top'])
        explicit_horizontal_lines.append(rec['bottom'])
        break
for rec in records:
    if date_pattern.match(rec['text']):
        explicit_horizontal_lines.append(rec['bottom'])
    else:
        break

# Column borders measured from the page; first/last double as the
# left/right edges of the bounding box.
explicit_vertical_lines = [20.1, 67.7, 100.7, 138.4, 263.0, 295.9, 328.8,
                           365.2, 400.8, 447.7, 486.9, 526.8, 575.9]

# bounding box = [left, top, right, bottom]
bbox = [explicit_vertical_lines[0],
        explicit_horizontal_lines[0],
        explicit_vertical_lines[-1],
        explicit_horizontal_lines[-1]]

table_settings = {
    'vertical_strategy': 'explicit',
    'horizontal_strategy': 'explicit',
    'explicit_vertical_lines': explicit_vertical_lines,
    'explicit_horizontal_lines': explicit_horizontal_lines,
}

# Visual sanity check of the deduced grid.
(
    page.crop(bbox)
    .to_image(resolution=200)
    .debug_tablefinder(table_settings)
    .show()
)
Step 1. Deduce the vertical line positions
pdf = pdfplumber.open(source)
page = pdf.pages[0]

# Find the caption line ('Date Tran Inv ...'); it fixes the x-position of
# every column.
for headline in page.extract_text_lines():
    if headline['text'].startswith('Date'):
        break

# Caption text with spaces removed; its character indices are assumed to
# line up 1:1 with headline['chars'] -- TODO confirm that extract_text_lines
# omits space chars here.
captions = headline['text'].replace(' ', '')


# FIX: a def instead of a lambda bound to a name (PEP 8 E731).
def where(word, border):
    """Return the x0 (left) or x1 (right) edge of *word* in the caption line.

    For border 'x1' the last character of the word is measured; otherwise
    the first character.
    """
    offset = len(word) - 1 if border == 'x1' else 0
    return headline['chars'][captions.find(word) + offset][border]


# deduce vertical lines positions
explicit_vertical_lines = [
    where('Date', 'x0'),
    where('Tran', 'x0'),
    (where('Tran', 'x1') + where('Inv', 'x0')) / 2,      # may need adjustment
    where('Product', 'x0'),
    where('Packing', 'x0'),
    (where('Packing', 'x0') + where('Rate', 'x1')) / 2,  # may need adjustment
    where('Rate', 'x1'),
    where('Qty', 'x1'),
    where('Bonus', 'x1'),
    where('Gross', 'x1'),
    where('Discount', 'x1'),
    where('SalesTax', 'x1'),
    where('NetAmt.', 'x1'),
]

# bounding box = [left, top, right, bottom]; top/bottom are filled in per
# block of rows later.
bbox = [headline['x0'], None, headline['x1'], None]
Step 2. Extract tables for each land
# The horizontal rulers are recomputed for every block of rows; the list
# object is shared with table_settings, so it must be mutated IN PLACE.
explicit_horizontal_lines = []
table_settings = {
    'vertical_strategy': 'explicit',
    'horizontal_strategy': 'explicit',
    'explicit_vertical_lines': explicit_vertical_lines,
    'explicit_horizontal_lines': explicit_horizontal_lines,
}
# FIX: raw string -- a plain '\d' is an invalid string escape and warns on
# modern Python.
date_pattern = re.compile(r'^\d{2}-[A-Z][a-z]{2}-\d{4}')

area = None
# Column names reused from the pattern-matching approach; 'result_1' is the
# DataFrame built there.  NOTE(review): it is undefined in this snippet --
# define it (or spell the captions out explicitly) before running this step.
captions = result_1.columns.drop('Area')
data = defaultdict(list)

for page in pdf.pages:
    records = iter(pagechars := page.extract_text_lines()) if False else iter(page.extract_text_lines())
    while True:
        explicit_horizontal_lines[:] = []  # reset in place, keep identity
        # Advance to the next data row, tracking area headers on the way.
        for rec in records:
            if rec['text'].lower() in lands:
                area = rec['text']
            elif date_pattern.match(rec['text']):
                explicit_horizontal_lines.append(rec['top'])
                explicit_horizontal_lines.append(rec['bottom'])
                break
        else:
            break  # no more data rows on this page: EXIT THE WHILE LOOP
        # Extend the block with every consecutive data row.
        for rec in records:
            if date_pattern.match(rec['text']):
                explicit_horizontal_lines.append(rec['bottom'])
            else:
                break
        bbox[1] = explicit_horizontal_lines[0]
        bbox[3] = explicit_horizontal_lines[-1]
        table = page.crop(bbox).extract_table(table_settings)
        data[area].append(pd.DataFrame(table, columns=captions))
Step 3. Collect the data
# Stack every per-area table into one frame, tagging each row with its area.
frames = []
for area, tables in data.items():
    frames.append(pd.concat(tables).assign(Area=area))
result = pd.concat(frames, ignore_index=True)
In some cases, pattern matching might seem useful at first glance. However, in large documents, subtle pattern variations can be overlooked, leading to missing data. Therefore, using extract_table
is often more reliable, especially when imaginary lines can be deduced in structured text.
Upvotes: 1