Extract each column as an image from a scanned PDF

Question

I need to crop each column out of a scanned PDF. I have tried many of the solutions posted here, but none of them worked.

For example, I have the image below.

original_img

I am using the script below to remove the horizontal and vertical lines, because with them present it is hard for me to isolate each column.

import cv2

# Integer divisor applied to both width and height when show_scaled
# previews an image (4 -> each side is shown at a quarter of its size).
SCALE = 4
def show_scaled(name, img):
    """Display ``img`` in a window titled ``name``, shrunk by ``SCALE``.

    Args:
        name: Window title passed to ``cv2.imshow``.
        img: Image array. Only the first two entries of ``img.shape``
            (height, width) are read, so single-channel and
            multi-channel images are both accepted.
    """
    # Slicing the shape handles 2-D (grayscale) and 3-D (colour) arrays
    # alike, without the try/except unpacking dance.
    h, w = img.shape[:2]
    # Clamp each side to at least 1 px: for an image smaller than SCALE
    # the integer division yields 0, and cv2.resize rejects an empty size.
    target_size = (max(1, w // SCALE), max(1, h // SCALE))
    cv2.imshow(name, cv2.resize(img, target_size))


# Load the scanned page and build an inverted binary mask (ink becomes
# white) using Otsu's automatically chosen threshold.
image = cv2.imread('3-1.png')
if image is None:
    # cv2.imread reports a missing or unreadable file by returning None
    # instead of raising, so fail here with a clear message.
    raise FileNotFoundError("could not read image '3-1.png'")
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

# Remove horizontal, then vertical lines. Both passes are identical apart
# from the kernel shape: a 15x1 kernel keeps only long horizontal runs,
# a 1x15 kernel keeps only long vertical runs.
for window_title, kernel_size in (
    ("horizontal detected lines", (15, 1)),
    ("vertical detected lines", (1, 15)),
):
    line_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, kernel_size)
    detected_lines = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, line_kernel, iterations=2)
    show_scaled(window_title, detected_lines)
    cnts = cv2.findContours(detected_lines, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    # findContours returns 2 values on some OpenCV versions and 3 on
    # others; pick whichever element holds the contour list.
    cnts = cnts[0] if len(cnts) == 2 else cnts[1]
    for c in cnts:
        # Paint over each detected line in white to erase it from the page.
        cv2.drawContours(image, [c], -1, (255, 255, 255), 2)

cv2.imshow('image', image)
# The cleaned page is saved for the column-extraction script.
cv2.imwrite("contours.png", image)
cv2.waitKey()

Result is:

After removing the borders, I use the script below to extract each column.

import cv2

# Integer divisor applied to both width and height when show_scaled
# previews an image.
SCALE = 4
# Minimum convex-hull area a contour must exceed in main() to be drawn
# and cropped as a column.
# NOTE(review): 427505.0 looks like an area measured on one sample page --
# confirm; a fixed pixel area will not adapt to other scan resolutions or
# column counts.
AREA_THRESHOLD = 427505.0 / 2


def show_scaled(name, img):
    """Show a reduced-size preview of ``img`` in the window ``name``.

    Args:
        name: Window title passed to ``cv2.imshow``.
        img: Image array. Height and width are taken from the first two
            entries of ``img.shape``, so grayscale and colour inputs
            both work.
    """
    # shape[:2] is (height, width) regardless of whether a channel axis
    # is present, so no try/except unpacking is needed.
    h, w = img.shape[:2]
    # Never request a 0-pixel side: integer division by SCALE gives 0 for
    # inputs smaller than SCALE, and cv2.resize rejects an empty size.
    target_size = (max(1, w // SCALE), max(1, h // SCALE))
    cv2.imshow(name, cv2.resize(img, target_size))


def main():
    """Crop every large text region of ``contours.png`` into its own file.

    The page is binarised, vertically closed so that the text lines of a
    column merge into one blob, and each blob whose convex hull exceeds
    ``AREA_THRESHOLD`` is outlined on a preview and cropped from the
    untouched input by its bounding rectangle.

    Raises:
        FileNotFoundError: if ``contours.png`` cannot be read.
    """
    base_img = cv2.imread("contours.png")
    if base_img is None:
        # cv2.imread returns None instead of raising on a missing or
        # unreadable file; fail early with a clear message.
        raise FileNotFoundError("could not read image 'contours.png'")
    img = base_img.copy()  # annotated preview; crops come from base_img
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    show_scaled("original", gray)

    # Black and white, and inverted, because white pixels are treated as
    # objects in contour detection. Block size is 25, constant offset 15.
    thresholded = cv2.adaptiveThreshold(
        gray, 255,
        cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY_INV,
        25,
        15
    )
    show_scaled('thresholded', thresholded)
    # The kernel (5 wide, 110 tall) is wide enough to connect characters
    # but not text blocks, and tall enough to connect lines.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 110))
    closing = cv2.morphologyEx(thresholded, cv2.MORPH_CLOSE, kernel)

    # findContours returns (contours, hierarchy) on some OpenCV versions
    # and (image, contours, hierarchy) on others; accept both layouts.
    found = cv2.findContours(closing, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    contours = found[0] if len(found) == 2 else found[1]
    show_scaled("closing", closing)

    for i, contour in enumerate(contours):
        convex_contour = cv2.convexHull(contour)
        if cv2.contourArea(convex_contour) <= AREA_THRESHOLD:
            # Too small to be a column; skip noise and stray text.
            continue
        cv2.drawContours(img, [convex_contour], -1, (255, 0, 0), 3)
        x, y, w, h = cv2.boundingRect(contour)
        cropped_image = base_img[y:y + h, x:x + w]
        # The file index is the contour's position in the full contour
        # list, so numbering may have gaps where contours were skipped.
        res = cv2.imwrite("/cropped-images/column" + str(i) + ".png", cropped_image)
        # imwrite reports success via its return value; print it so a
        # failed write (e.g. missing target directory) is visible.
        print(res)

    show_scaled("contours", img)
    cv2.imwrite("/tmp/contours.png", img)
    cv2.waitKey()


# Run the column extraction only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()

Result is:

There are two problems with what I did. First, the "KULLANMA" text in the upper-right corner should also be removed. I can't simply crop the image at fixed coordinates, because the page can be smaller or bigger, so the pixel distance between that text and the text below it can change.

Second, in the second script, OpenCV also draws a contour around the header section, as in the image below.

How can I get only the column images and remove the upper-right text from the image?

Edit:

The script should also support PDFs with 3, 4, 6, or 8 columns. I tested it with a 3-column page and the result is not what I expected. I need a more general solution. Can anyone help me, please?

I expect these three columns to have separate contours, but the result is like the one below.

Extract each column as image from scanned pdf

Answers (0)

Related Questions