1import boto3
2import time
3
def startJob(s3BucketName, objectName):
    """Start a Textract document text detection job for an S3 object.

    Args:
        s3BucketName: Name of the S3 bucket that holds the document.
        objectName: Name (key) of the document object inside that bucket.

    Returns:
        The "JobId" value taken from the start_document_text_detection
        response.
    """
    # Describe where the document lives before talking to the service.
    documentLocation = {
        'S3Object': {
            'Bucket': s3BucketName,
            'Name': objectName,
        },
    }

    textract = boto3.client('textract')
    startResponse = textract.start_document_text_detection(
        DocumentLocation=documentLocation
    )
    return startResponse["JobId"]
16
def isJobComplete(jobId):
    """Poll a Textract text detection job until it leaves "IN_PROGRESS".

    Sleeps five seconds before every poll (including the first one) and
    prints the status seen on each poll.

    Args:
        jobId: Identifier of the job to poll.

    Returns:
        The first "JobStatus" string that is not "IN_PROGRESS". Note that
        this is the status text, not a boolean.
    """
    # For production use cases, use SNS based notification
    # Details at: https://docs.aws.amazon.com/textract/latest/dg/api-async.html
    pollDelaySeconds = 5

    # Wait once before the first poll, then create the client.
    time.sleep(pollDelaySeconds)
    textract = boto3.client('textract')

    while True:
        jobStatus = textract.get_document_text_detection(JobId=jobId)["JobStatus"]
        print("Job status: {}".format(jobStatus))
        if jobStatus != "IN_PROGRESS":
            return jobStatus
        # Still running: wait before asking again.
        time.sleep(pollDelaySeconds)
33
def getJobResults(jobId):
    """Collect every result page of a Textract text detection job.

    Calls get_document_text_detection repeatedly, following the
    "NextToken" value of each response until a response carries none,
    and prints a running count of pages received.

    Args:
        jobId: Identifier of the job whose results are fetched.

    Returns:
        A list of the raw response dictionaries, in the order received.
    """
    textract = boto3.client('textract')
    collected = []

    # The first request carries only the job id; "NextToken" is added once a
    # page tells us there is more to fetch.
    request = {'JobId': jobId}
    while True:
        page = textract.get_document_text_detection(**request)
        collected.append(page)
        print("Resultset page recieved: {}".format(len(collected)))

        token = page.get('NextToken')
        if not token:
            break
        request['NextToken'] = token

    return collected
58
# Document to analyse: the S3 bucket and the name of the PDF object inside it.
s3BucketName = "ki-textract-demo-docs"
documentName = "Amazon-Textract-Pdf.pdf"

jobId = startJob(s3BucketName, documentName)
print("Started job with id: {}".format(jobId))

# isJobComplete() returns the final status *string*, which is truthy even for
# "FAILED", so compare it explicitly instead of relying on truthiness.
jobStatus = isJobComplete(jobId)

# Default to "no pages" so the printing loop below is always safe to run,
# even when no results are fetched.
response = []
if jobStatus in ("SUCCEEDED", "PARTIAL_SUCCESS"):
    response = getJobResults(jobId)
else:
    print("Job {} ended with status: {}".format(jobId, jobStatus))

# Print detected text
for resultPage in response:
    # A page without a "Blocks" entry is skipped instead of raising KeyError.
    for item in resultPage.get("Blocks", []):
        if item["BlockType"] == "LINE":
            # Wrap the line in ANSI escape codes: bright blue, then reset.
            print('\033[94m' + item["Text"] + '\033[0m')