Source code for camelot.handlers

"""Functions to handle all operations on PDFs."""

from __future__ import annotations

import multiprocessing as mp
import os
from pathlib import Path
from typing import Any

from pdfminer.layout import LTChar
from pdfminer.layout import LTImage
from pdfminer.layout import LTTextLineHorizontal
from pdfminer.layout import LTTextLineVertical
from pypdf import PdfReader
from pypdf import PdfWriter
from pypdf._utils import StrByteType

from .core import TableList
from .parsers import Hybrid
from .parsers import Lattice
from .parsers import Network
from .parsers import Stream
from .utils import TemporaryDirectory
from .utils import download_url
from .utils import get_image_char_and_text_objects
from .utils import get_page_layout
from .utils import get_rotation
from .utils import is_url


# Registry mapping each supported ``flavor`` name to the parser class that
# implements it; ``PDFHandler.parse`` looks the flavor up here.
PARSERS = dict(
    lattice=Lattice,
    stream=Stream,
    network=Network,
    hybrid=Hybrid,
)


class PDFHandler:
    """Handle all operations on a PDF file.

    Covers temp directory creation, splitting the file into single page
    PDFs, parsing each PDF and then removing the temp directory.

    Parameters
    ----------
    filepath : str
        Filepath or URL of the PDF file.
    pages : str, optional (default: '1')
        Comma-separated page numbers.
        Example: '1,3,4' or '1,4-end' or 'all'.
    password : str, optional (default: None)
        Password for decryption.
    debug : bool, optional (default: False)
        Whether the parser should store debug information during parsing.

    Raises
    ------
    NotImplementedError
        If ``filepath`` is a string that does not end with ``.pdf``
        (case-insensitive).

    """

    def __init__(
        self,
        filepath: StrByteType | Path | str,
        pages="1",
        password=None,
        debug=False,
    ):
        self.debug = debug
        # Remote files are downloaded first; the local copy is used afterwards.
        if is_url(filepath):
            filepath = download_url(str(filepath))
        self.filepath: StrByteType | Path | str = filepath

        # Only plain strings are checked for the extension; other input kinds
        # (Path, streams) are passed through as-is.
        if isinstance(filepath, str) and not filepath.lower().endswith(".pdf"):
            raise NotImplementedError("File format not supported")

        # A missing password is normalised to the empty string.
        self.password = "" if password is None else password  # noqa: S105
        self.pages = self._get_pages(pages)

    def _get_pages(self, pages):
        """Convert pages string to list of integers.

        Parameters
        ----------
        pages : str, optional (default: '1')
            Comma-separated page numbers.
            Example: '1,3,4' or '1,4-end' or 'all'.

        Returns
        -------
        P : list
            Sorted list of unique int page numbers.

        """
        page_numbers = []

        if pages == "1":
            # Default case: no need to open the PDF at all.
            page_numbers.append({"start": 1, "end": 1})
        else:
            infile = PdfReader(self.filepath, strict=False)
            if infile.is_encrypted:
                infile.decrypt(self.password)

            if pages == "all":
                page_numbers.append({"start": 1, "end": len(infile.pages)})
            else:
                for r in pages.split(","):
                    if "-" in r:
                        a, b = r.split("-")
                        # int() already tolerates surrounding whitespace, so
                        # accept it around the "end" keyword as well.
                        if b.strip() == "end":
                            b = len(infile.pages)
                        page_numbers.append({"start": int(a), "end": int(b)})
                    else:
                        page_numbers.append({"start": int(r), "end": int(r)})

        # Expand every inclusive range and de-duplicate overlapping ones.
        result = set()
        for p in page_numbers:
            result.update(range(p["start"], p["end"] + 1))
        return sorted(result)

    def _save_page(
        self, filepath: StrByteType | Path, page: int, temp: str, **layout_kwargs
    ) -> tuple[
        Any,
        tuple[float, float],
        list[LTImage],
        list[LTChar],
        list[LTTextLineHorizontal],
        list[LTTextLineVertical],
    ]:
        """Save the specified page from the PDF into a temporary directory.

        The page is written to ``<temp>/page-<page>.pdf``. If the text on the
        page is detected as rotated, the page is rotated and rewritten to the
        same path, and the layout is recomputed from the rotated file.

        Parameters
        ----------
        filepath : str
            Filepath or URL of the PDF file.
        page : int
            Page number (1-based).
        temp : str
            Tmp directory.
        **layout_kwargs
            Forwarded to ``get_page_layout``.

        Returns
        -------
        layout : object
            Page layout as returned by ``get_page_layout``.
        dimensions : tuple
            The dimensions of the pdf page.
        images : list
        chars : list
        horizontal_text : list
        vertical_text : list
            Objects extracted from the layout by
            ``get_image_char_and_text_objects``.

        """
        infile = PdfReader(filepath, strict=False)
        if infile.is_encrypted:
            infile.decrypt(self.password)

        fpath = os.path.join(temp, f"page-{page}.pdf")
        outfile = PdfWriter()
        outfile.add_page(infile.pages[page - 1])
        with open(fpath, "wb") as f:
            outfile.write(f)

        layout, dimensions = get_page_layout(fpath, **layout_kwargs)
        images, chars, horizontal_text, vertical_text = (
            get_image_char_and_text_objects(layout)
        )

        # fix rotated PDF
        rotation = get_rotation(chars, horizontal_text, vertical_text)
        if rotation != "":
            # Build the name from its parts instead of string-replacing "page"
            # in the full path, which would also rewrite a temp directory whose
            # own path happens to contain "page".
            fpath_new = os.path.join(temp, f"p-{page}_rotated.pdf")
            os.rename(fpath, fpath_new)

            # The context manager guarantees the handle is released even if
            # reading, rotating or writing raises.
            with open(fpath_new, "rb") as instream:
                rotated_reader = PdfReader(instream, strict=False)
                if rotated_reader.is_encrypted:
                    rotated_reader.decrypt(self.password)
                rotated_page = rotated_reader.pages[0]
                if rotation == "anticlockwise":
                    rotated_page.rotate(90)
                elif rotation == "clockwise":
                    rotated_page.rotate(-90)
                rotated_writer = PdfWriter()
                rotated_writer.add_page(rotated_page)
                with open(fpath, "wb") as f:
                    rotated_writer.write(f)

            # Only recompute layout and dimension after rotating the pdf
            layout, dimensions = get_page_layout(fpath, **layout_kwargs)
            images, chars, horizontal_text, vertical_text = (
                get_image_char_and_text_objects(layout)
            )

        return layout, dimensions, images, chars, horizontal_text, vertical_text

    def parse(
        self,
        flavor: str = "lattice",
        suppress_stdout: bool = False,
        parallel: bool = False,
        layout_kwargs: dict[str, Any] | None = None,
        **kwargs,
    ):
        """Extract tables by calling parser.get_tables on all single page PDFs.

        Parameters
        ----------
        flavor : str (default: 'lattice')
            The parsing method to use.
            Lattice is used by default.
        suppress_stdout : bool (default: False)
            Suppress logs and warnings.
        parallel : bool (default: False)
            Process pages in parallel using all available cpu cores.
        layout_kwargs : dict, optional (default: {})
            A dict of `pdfminer.layout.LAParams
            <https://pdfminersix.readthedocs.io/en/latest/reference/composable.html#laparams>`_ kwargs.
        kwargs : dict
            See camelot.read_pdf kwargs.

        Returns
        -------
        tables : camelot.core.TableList
            List of tables found in PDF.

        Raises
        ------
        KeyError
            If ``flavor`` is not a key of ``PARSERS``.

        """
        if layout_kwargs is None:
            layout_kwargs = {}

        tables = []

        parser_obj = PARSERS[flavor]
        parser = parser_obj(debug=self.debug, **kwargs)

        with TemporaryDirectory() as tempdir:
            cpu_count = mp.cpu_count()
            # Using multiprocessing only when cpu_count > 1 to prevent a stall
            # when cpu_count is 1
            if parallel and len(self.pages) > 1 and cpu_count > 1:
                with mp.get_context("spawn").Pool(processes=cpu_count) as pool:
                    # Submit every page first, then collect in page order.
                    jobs = [
                        pool.apply_async(
                            self._parse_page,
                            (p, tempdir, parser, suppress_stdout, layout_kwargs),
                        )
                        for p in self.pages
                    ]
                    for j in jobs:
                        tables.extend(j.get())
            else:
                for p in self.pages:
                    tables.extend(
                        self._parse_page(
                            p, tempdir, parser, suppress_stdout, layout_kwargs
                        )
                    )

        return TableList(sorted(tables))

    def _parse_page(
        self, page: int, tempdir: str, parser, suppress_stdout: bool, layout_kwargs
    ):
        """Extract tables by calling parser.get_tables on a single page PDF.

        Parameters
        ----------
        page : int
            Page number to parse
        tempdir : str
            Directory the single page PDF is written to.
        parser : Lattice, Stream, Network or Hybrid
            The parser to use.
        suppress_stdout : bool
            Suppress logs and warnings.
            NOTE(review): accepted but not used in this method; confirm
            whether it should be forwarded to the parser.
        layout_kwargs : dict, optional (default: {})
            A dict of `pdfminer.layout.LAParams
            <https://pdfminersix.readthedocs.io/en/latest/reference/composable.html#laparams>`_ kwargs.

        Returns
        -------
        tables
            Tables found on the page, as returned by
            ``parser.extract_tables()``.

        """
        # ``chars`` is returned by _save_page but is not needed by the parser.
        layout, dimensions, images, chars, horizontal_text, vertical_text = (
            self._save_page(self.filepath, page, tempdir, **layout_kwargs)
        )
        page_path = os.path.join(tempdir, f"page-{page}.pdf")
        parser.prepare_page_parse(
            page_path,
            layout,
            dimensions,
            page,
            images,
            horizontal_text,
            vertical_text,
            layout_kwargs=layout_kwargs,
        )
        tables = parser.extract_tables()
        return tables