Reputation: 35
Here is part of the PDF structure:
5 0 obj
<< /Length 56 >>
stream
BT /F1 12 Tf 100 700 Td 15 TL (JavaScript example) Tj ET
endstream
endobj
6 0 obj
<<
/Type /Font
/Subtype /Type1
/Name /F1
/BaseFont /Helvetica
/Encoding /MacRomanEncoding
>>
endobj
7 0 obj
<<
/Type /Action
/S /JavaScript
I want to search for "javascript" to see whether it is there or not. The problem is that "javascript" can be represented by its hex codes, in whole or in part: "javascript", "Jav#61Script", "J#61v#61Script", and so on.
So how can I find out whether "javascript" exists, given all of these possibilities?
Upvotes: 1
Views: 723
Reputation: 15511
Read it in a character at a time and translate any hex you find to characters as you go, also translating to lowercase. Compare the result to "javascript".
Here's an idea:
import string
import os
import re
def _pdf_escaped_pattern(needle):
    """Return a bytes regex matching *needle* with any byte optionally #hh-escaped.

    Each byte of *needle* may appear literally or as ``#`` followed by its
    two-digit hex code. For ASCII letters the hex code of the other case is
    accepted as well, because the search is case-insensitive and the hex
    codes of ``j`` (6a) and ``J`` (4a) differ.
    """
    parts = []
    for value in bytearray(needle):
        variants = [value]
        if 65 <= value <= 90:        # 'A'..'Z' -> also accept the lowercase code
            variants.append(value + 32)
        elif 97 <= value <= 122:     # 'a'..'z' -> also accept the uppercase code
            variants.append(value - 32)
        literal = re.escape(bytes(bytearray([value])))
        escapes = [("#%02x" % v).encode("ascii") for v in variants]
        parts.append(b"(?:" + b"|".join([literal] + escapes) + b")")
    return b"".join(parts)


def pdf_find_str(pdfname, str, chunk_size=2 * 1024 * 1024):
    """Find *str* in a PDF file, allowing ``#hh`` hex escapes for any character.

    The match is case-insensitive (ASCII) and *str* is treated as a literal
    string, not as a regular expression. Hex digits of an escape may be
    upper or lower case, so ``javascript``, ``Jav#61Script`` and
    ``J#61v#61Script`` are all found.

    Args:
        pdfname: path of the file to scan; it is opened in binary mode.
        str: the text to look for (text or bytes).
        chunk_size: number of bytes read per I/O call. Raised internally if
            it is too small for a match to fit in one window.

    Returns:
        The byte offset in the file where the first match starts, or -1 if
        there is no match. The offset refers to the raw file contents, so it
        is exact even when the match contains ``#hh`` escapes.
    """
    # The parameter shadows the builtin; rebind it to a bytes needle at once.
    needle = str if isinstance(str, bytes) else str.encode("latin-1")
    pattern = re.compile(_pdf_escaped_pattern(needle), re.I)

    # A fully escaped match is 3 bytes per character, so carrying over the
    # last (3 * len - 1) bytes guarantees a match spanning two reads is seen.
    overlap = max(3 * len(needle) - 1, 0)
    step = max(chunk_size, overlap + 1)

    # "with" closes the file on every exit path, including the early return.
    with open(pdfname, "rb") as f:
        base = 0  # file offset of window[0]
        window = f.read(step)
        while window:
            found = pattern.search(window)
            if found:
                return base + found.start()
            fresh = f.read(step)
            if not fresh:
                break
            tail = window[-overlap:] if overlap else b""
            base += len(window) - len(tail)
            window = tail + fresh
    return -1
# On one file:
#if pdf_find_str("Consciousness Explained.pdf", "javascript") != -1:
# print 'Contains "javascript"'
# Recursively on a directory:
# Walk the "Books" directory tree and report every .pdf file in which
# pdf_find_str locates "javascript", together with the reported position.
for root, dirs, files in os.walk("Books"):
    for filename in files:
        if filename.endswith(".pdf"):
            # os.path.join picks the right separator for the platform.
            position = pdf_find_str(os.path.join(root, filename), "javascript")
            if position != -1:
                # Single-argument print() gives the same "name ( pos )"
                # output on both Python 2 and Python 3.
                print("%s ( %s )" % (filename, position))
# Note: position returned by pdf_find_str does not account for removed
# characters from #ff representations (if any).
Upvotes: 2