using textract aws pdf example

Solutions on MaxInterview for using textract aws pdf example by the best coders in the world

showing results for - "using textract aws pdf example"
Zach
22 Aug 2016
1import boto3
2import time
3
def startJob(s3BucketName, objectName):
    """Start an asynchronous Textract text-detection job for an S3 object.

    Args:
        s3BucketName: Name of the S3 bucket that holds the document.
        objectName: Name (key) of the document inside that bucket.

    Returns:
        The "JobId" value taken from the start_document_text_detection
        response.
    """
    textract = boto3.client('textract')
    location = {'S3Object': {'Bucket': s3BucketName, 'Name': objectName}}
    job = textract.start_document_text_detection(DocumentLocation=location)
    return job["JobId"]
16
def isJobComplete(jobId):
    """Poll a Textract text-detection job until it leaves "IN_PROGRESS".

    Sleeps five seconds before every poll (including the first) and prints
    the status after each poll.

    For production use cases, use SNS based notification instead of polling.
    Details at: https://docs.aws.amazon.com/textract/latest/dg/api-async.html

    Args:
        jobId: Identifier of the job to poll.

    Returns:
        The last "JobStatus" value read from the service. Note that this is
        the status value itself, not a boolean.
    """
    textract = boto3.client('textract')
    while True:
        time.sleep(5)
        poll = textract.get_document_text_detection(JobId=jobId)
        status = poll["JobStatus"]
        print("Job status: {}".format(status))
        if status != "IN_PROGRESS":
            return status
33
def getJobResults(jobId):
    """Collect every result page of a Textract text-detection job.

    The first request carries only the job id; each following request also
    passes the "NextToken" from the previous response, until a response has
    no (or a falsy) "NextToken".

    Args:
        jobId: Identifier of the job whose results are fetched.

    Returns:
        A list of the raw get_document_text_detection responses, in the
        order they were received.
    """
    textract = boto3.client('textract')
    collected = []
    request = {'JobId': jobId}

    while True:
        page = textract.get_document_text_detection(**request)
        collected.append(page)
        print("Resultset page recieved: {}".format(len(collected)))

        token = page.get('NextToken')
        if not token:
            break
        # Ask for the page that follows the one just received.
        request['NextToken'] = token

    return collected
58
# Document to analyse: S3 bucket and object name.
s3BucketName = "ki-textract-demo-docs"
documentName = "Amazon-Textract-Pdf.pdf"

jobId = startJob(s3BucketName, documentName)
print("Started job with id: {}".format(jobId))

# isJobComplete() returns the final status value rather than a boolean, so
# testing it directly would treat a failed job as complete. Compare against
# the statuses for which Textract has results to return.
jobStatus = isJobComplete(jobId)

# Bind response up front so the printing loop below never hits an
# undefined name when the job did not produce results.
response = []
if jobStatus in ("SUCCEEDED", "PARTIAL_SUCCESS"):
    response = getJobResults(jobId)
else:
    print("Job finished with status {}; no results to print".format(jobStatus))

# Print detected text: one highlighted line per LINE block on every page.
for resultPage in response:
    for item in resultPage["Blocks"]:
        if item["BlockType"] == "LINE":
            # ANSI escape codes: bright blue text, then reset.
            print('\033[94m' + item["Text"] + '\033[0m')
similar questions
queries leading to this page
using textract aws pdf example