// Snippet found for the query: "pdfjs get all the text present"
// Author: Marisol
// Date: 19 Jan 2020
import PDFJS from "pdfjs-dist";
// Added to fit pdfjs-dist 2.3.0: the worker build is imported explicitly.
import PDFJSWorker from "pdfjs-dist/build/pdf.worker.js";

// Switch off the text layer option.
PDFJS.disableTextLayer = true;
// Not available anymore since 2.3.0 (see the PDFJSWorker import above).
PDFJS.disableWorker = true;

/**
 * Extracts the text of a single page of a loaded PDF.
 *
 * @param pdf - Document to read the page from.
 * @param pageNo - Page number handed to `pdf.getPage`.
 * @returns The `str` of every text item on the page, concatenated in order
 *   with no separator between items.
 */
const getPageText = async (pdf: Pdf, pageNo: number): Promise<string> => {
  const page = await pdf.getPage(pageNo);
  const { items } = await page.getTextContent();
  // Items are glued together as-is; any spacing comes from the items themselves.
  return items.map(token => token.str).join("");
};

/**
 * Reads the text of every page of a PDF and returns it as a single string.
 *
 * See the example of a PDFSource below.
 *
 * @param source - Passed unchanged to `PDFJS.getDocument`.
 * @returns The text of each page, in page order, joined with a single space.
 */
export const getPDFText = async (source: PDFSource): Promise<string> => {
  // Added to fit pdfjs-dist 2.3.0: expose the imported worker on `window`.
  Object.assign(window, { pdfjsWorker: PDFJSWorker });
  const pdf: Pdf = await PDFJS.getDocument(source).promise;
  // Pages are numbered from 1; every page is requested up front and the
  // results are awaited together, so the output keeps page order.
  const pageTexts = await Promise.all(
    Array.from({ length: pdf.numPages }, (_, index) => getPageText(pdf, index + 1)),
  );
  return pageTexts.join(" ");
};
