Arabic text recognition from PDF using Python

Solutions on MaxInterview for Arabic text recognition from PDF using Python, by the best coders in the world

showing results for - "arabic text recognition from pdf using python"
Rémy
24 Jan 2018
import codecs
import os
from os import chdir, getcwd, listdir, path
from time import strftime

import pyPdf
6
def check_path(prompt):
    """Prompt repeatedly until the user enters a path that exists.

    Args:
        prompt: Text shown to the user each time a path is requested.

    Returns:
        The first entered string for which ``os.path.exists`` is true.
    """
    # Python 2 names the line reader ``raw_input``; Python 3 renamed it to
    # ``input``.  Resolve whichever one is available so the function runs
    # under both interpreters.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    abs_path = read_line(prompt)
    # Go through ``os.path`` rather than a bare ``path`` name, which is easy
    # to shadow with an ordinary variable elsewhere in a script.
    while not os.path.exists(abs_path):
        print("\nThe specified path does not exist.\n")
        abs_path = read_line(prompt)
    return abs_path
16
print("\n")

# Ask for the folder to scan; check_path only returns once the path exists.
folder = check_path("Provide absolute path for the folder: ")

# Collect every .pdf found below the folder.  Each file name is joined with
# the directory os.walk reported it in (root), not with the top-level folder,
# so PDFs inside sub-folders get a path that actually exists.
pdf_paths = []
for root, dirs, files in os.walk(folder):
    for filename in files:
        if filename.endswith('.pdf'):
            pdf_paths.append(os.path.join(root, filename))

print(len(pdf_paths))

for pdf_path in pdf_paths:
    print(pdf_path)

    # The text file is written next to the PDF; only the extension changes.
    txt_path = os.path.splitext(pdf_path)[0] + ".txt"

    # Keep the PDF open while its pages are read, then close it even if
    # extraction raises.
    with open(pdf_path, "rb") as pdf_file:
        pdf = pyPdf.PdfFileReader(pdf_file)
        pages = [pdf.getPage(j).extractText()
                 for j in range(pdf.getNumPages())]

    # One newline after every page, matching the page-by-page layout.
    content = u"".join(page + u"\n" for page in pages)

    print(strftime("%H:%M:%S") + "  pdf  -> txt ")

    # Encode as UTF-8 on write so non-ASCII text (e.g. Arabic) is stored
    # correctly; the context manager also guarantees the file is closed.
    with codecs.open(txt_path, 'w', encoding='utf-8') as out:
        out.write(content)
56