Facing irregular format while extracting data from pdf invoice to transfer in excel file

Question

I have a irregular format pdf invoice files with multiple pages. I want excel file in return with data extracted from pdf files. For this I write code with plumberpdf library in python but I am able to propely use regex to read and extracting data. This is due to irregular format.

I want area_name, Date, Tran, Inv # Product, Description, Packing, Rate, Qty, Bonus, Gross, Discount, Sales, Tax Net Amt. all these columns in excel with missing values as zero. Excel pic is attached for review enter image description here this is the file: https://github.com/HananWali/Ak-pharma.git

import re
import requests
import pdfplumber
import pandas as pd
from collections import namedtuple

with pdfplumber.open('AK PHARMA  (1).pdf') as pdf:
    page = pdf.pages[19]
    text = page.extract_text()
Inv = namedtuple('Inv', 'area_name date_p Tran prd_inv prd_des pack rate QTY Bonus Gross Dis Net_Am')

#new_vend_re = re.compile(r'(\b[A-Z]\d{1} [A-Z].*)')
#new_vend_re = re.compile(r'([A-Za-z0-9\s]+)')
#new_vend_re = re.compile(r'[A-Za-z0-9\s]+ (Hospital|Clinic|DHQ|S\d{1,2})', re.IGNORECASE)
#new_vend_re = re.compile(r'\b(?:S\d{1,2}\s)?[A-Za-z]+(?:\s[A-Za-z]+)?\s(?:Station|Hospital|DHQ|[A-Za-z]+)\b', re.IGNORECASE)
new_vend_re = re.compile(r'\b(Chichawatni|Harrapa|Kasowal|Mianchannu|Malka Hans|Gogera|Qabula|Arif Wala|Sahiwal|90 Morh|S1 High Street|S2 Mission Chowk|S3 DHQ Hospital|S4 Farid Town|S5 Jahaz Ground|S7 Karbala Road|S7 Mazdoor Puli|S8 TBZ Colony)\b', re.IGNORECASE)
#new_vend_re = re.compile(r'\d+ \d+ - [A-Za-z\s]+ $[A-Za-z\s]+$')

for line in text.split('\n'):
    if new_vend_re.match(line):
        print(line)
        area_name = line
        print(area_name)
pattren = re.compile(r'(\d{2}-[A-Za-z]{3}-\d{4})\s+(Sale|Return)?\s*(\S+[-\S]+)\s+([A-Za-z0-9\s]+)\s+(\d+[A-Za-z]+)\s+([\d\.]+)\s+(\d+)\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)')
lines = []

with pdfplumber.open('AK PHARMA  (1).pdf') as pdf:
    for page in pdf.pages:
        text = page.extract_text()
        area_name = None 
        
        for line in text.split('\n'):
            # Check if line matches area name pattern
            temp = new_vend_re.search(line)
            if temp :  # Capture area name once per page
                area_name = temp.group(0)
                print("Area Name:", area_name)
                continue

            # Check if line matches the invoice details pattern
            line_match = pattren.search(line)
            if line_match and area_name:
                date_p = line_match.group(0)  # Invoice date
                Tran = line_match.group(1)     # Transaction type (Sale/Return)
                prd_inv = line_match.group(2)  # Product invoice
                prd_des = line_match.group(3)  # Product description
                pack = line_match.group(4)     # Package type
                rate = line_match.group(5)     # Rate
                QTY = line_match.group(6)      # Quantity
                Bonus = line_match.group(7)    # Bonus
                Gross = line_match.group(8)    # Gross amount
                Dis = line_match.group(9)     # Discount
                Net_Am = line_match.group(10)  # Net Amount
                
                # Append each entry as an Inv object
                lines.append(Inv(area_name, date_p, Tran, prd_inv, prd_des, pack, rate, QTY, Bonus, Gross, Dis, Net_Am))

# Print results in a more readable format
for inv in lines:
    print(inv)
df = pd.DataFrame(lines)
df.head()
df.to_csv('inv_lines.csv')

Facing irregular format while extracting data from pdf invoice to transfer in excel file

Answers (1)

Extracting structured but fragmented text using pdfplumber

TL;DR

Extracting tables using pattern matching

Collecting data using extract_table

More accurate and detailed calculations

Summary

Related Questions