# using PyMuPDF
# Write the plain text of every page of a document to "<filename>.txt",
# with a form feed after each page as a page delimiter.
import sys

import fitz

if len(sys.argv) < 2:
    # Fail with a usage hint instead of an IndexError traceback.
    sys.exit(f"usage: {sys.argv[0]} <document>")

fname = sys.argv[1]  # get document filename

# The context managers close both the document and the output file even
# if text extraction raises part-way through.
with fitz.open(fname) as doc, open(fname + ".txt", "wb") as out:
    for page in doc:  # iterate the document pages
        text = page.get_text().encode("utf8")  # plain text, encoded as UTF-8
        out.write(text)  # write text of page
        out.write(b"\x0c")  # write page delimiter (form feed 0x0C)
# pip install tika
# Print the text that tika extracts from a PDF file.
from tika import parser

# The parse result is indexed like a dict; the extracted text is read
# from its 'content' entry.
raw = parser.from_file('yourfile.pdf')
print(raw['content'])
# Print the text of the first page of a PDF using pdfplumber.
import pdfplumber

# The with-statement closes the PDF once the block is left.
with pdfplumber.open(r'example.pdf') as pdf:
    first_page = pdf.pages[0]  # pages are indexed from 0
    print(first_page.extract_text())
# Print the text of the first page of a PDF using pdfplumber.
import pdfplumber

# Raw string so the backslash in the Windows path is not treated as an
# escape sequence; the with-statement closes the PDF on exit.
with pdfplumber.open(r'D:\examplepdf.pdf') as pdf:
    first_page = pdf.pages[0]  # pages are indexed from 0
    print(first_page.extract_text())