# Source code for camelot.io

"""I/O functions that read a PDF and return the extracted tables."""

import os
import warnings
from typing import Any

from .core import TableList
from .handlers import FilepathOrBuffer
from .handlers import PDFHandler
from .utils import TemporaryDirectory
from .utils import remove_extra
from .utils import validate_input

# Per-axis minimum for the auto-flavor heuristic: a page is called "ruled"
# only when it has at least this many vertical line segments AND at least
# this many horizontal ones (the two counts are compared separately, not
# summed). Two lines per axis catches even tiny ruled tables (a 2-row 2-col
# grid produces 3 horizontal + 3 vertical lines including borders) while
# keeping borderless pages with one stray underline accent from being
# mis-classified.
_AUTO_FLAVOR_LINE_THRESHOLD = 2


def _detect_flavor(filepath, password=None, page=1, png_path=None):
    """Pick the most appropriate flavor for a single PDF page.

    Renders ``page``, thresholds it, and counts ruled horizontal and
    vertical line segments. Used when the caller passes ``flavor="auto"``
    (once per requested page, so a document with text-only cover pages and
    ruled tables deeper in is routed correctly per page).

    Returns
    -------
    str
        Either ``"lattice"`` (enough ruled lines on the rendered page) or
        ``"network"`` (else). ``"network"`` is also the fallback when
        rendering itself fails (e.g. unreadable PDF, missing backend
        dependencies) — the assumption is that giving the text-based parser
        a chance is more useful than raising before parsing starts.
    """
    # Local imports keep \`camelot.read_pdf\` import-time cheap — cv2/playa
    # imports already weigh in for the parsers; deferring these for the
    # default \`flavor="lattice"\` path costs nothing.
    from .backends import ImageConversionBackend
    from .image_processing import adaptive_threshold
    from .image_processing import find_lines

    def _probe(target_png):
        # ImageConversionBackend.convert takes (pdf_path, png_path, page); it
        # has no `resolution` kwarg — passing one raised TypeError that the
        # except below silently swallowed, so 'auto' always fell back to
        # 'network'. (#auto-flavor regression)
        ImageConversionBackend().convert(str(filepath), target_png, page=page)
        if not os.path.exists(target_png):
            return None
        _, threshold = adaptive_threshold(target_png, process_background=False)
        # Use the Lattice default line_scale (15) — picking 40 here excludes
        # legitimate small/medium ruled tables.
        _, v = find_lines(threshold, direction="vertical", line_scale=15)
        _, h = find_lines(threshold, direction="horizontal", line_scale=15)
        return len(v), len(h)

    try:
        if png_path is None:
            # No reuse requested: render to a throwaway temp file.
            with TemporaryDirectory() as tmpdir:
                counts = _probe(os.path.join(tmpdir, "auto_flavor_probe.png"))
        else:
            # Render to the caller-owned path so the rendered page can be
            # reused by the parser (avoids a second render — see _parse_auto).
            counts = _probe(png_path)
    except Exception:
        # Any failure on the probe (no usable backend, encrypted page, broken
        # PDF, OpenCV import surprise) is *not* fatal — the user asked us to
        # pick, we pick the more forgiving option and let the parser report
        # the real error if any.
        return "network"

    if counts is None:
        return "network"
    v_count, h_count = counts
    has_grid = (
        v_count >= _AUTO_FLAVOR_LINE_THRESHOLD
        and h_count >= _AUTO_FLAVOR_LINE_THRESHOLD
    )
    return "lattice" if has_grid else "network"


def _normalize_per_page(per_page):
    """Coerce ``per_page`` to ``{int: dict}`` form, raising ValueError on bad input.

    Accepts None / empty (returns ``{}``), int or str keys, and dict
    values. Other shapes raise ``ValueError`` with a precise message
    naming the offending entry. Values are shallow-copied so a later
    in-place edit doesn't mutate the caller's dict.
    """
    if per_page is None:
        return {}
    per_page_norm: dict[int, dict[str, Any]] = {}
    for k, v in per_page.items():
        try:
            page_no = int(k)
        except (TypeError, ValueError) as exc:
            raise ValueError(f"per_page keys must be page numbers, got {k!r}") from exc
        if not isinstance(v, dict):
            raise ValueError(
                f"per_page[{k!r}] must be a dict of kwargs, got {type(v).__name__}"
            )
        per_page_norm[page_no] = dict(v)
    return per_page_norm


def _validate_per_page(per_page_norm, global_flavor):
    """Validate each per-page override against its effective flavor.

    Each entry's flavor is either an explicit override (must be one of
    the four concrete flavors — ``"auto"`` doesn't make sense
    page-by-page) or the global flavor. All non-flavor kwargs are then
    checked with the existing :func:`validate_input` against that
    effective flavor.
    """
    for page_no, overrides in per_page_norm.items():
        page_flavor = overrides.get("flavor", global_flavor)
        if page_flavor not in ("lattice", "stream", "network", "hybrid", "ml"):
            raise NotImplementedError(
                f"per_page[{page_no}] flavor={page_flavor!r} is not"
                " one of: 'lattice', 'stream', 'network', 'hybrid', 'ml'."
                " ('auto' is only valid as the global flavor.)"
            )
        page_kwargs = {k: v for k, v in overrides.items() if k != "flavor"}
        validate_input(page_kwargs, flavor=page_flavor)


def read_pdf(
    filepath: FilepathOrBuffer,
    pages="1",
    password=None,
    flavor="lattice",
    suppress_stdout=False,
    parallel=False,
    cpu_count=None,
    layout_kwargs=None,
    per_page=None,
    debug=False,
    **kwargs,
):
    r"""Read PDF and return extracted tables.

    Note: kwargs annotated with ^ can only be used with flavor='stream' or
    flavor='network' and kwargs annotated with * can only be used with
    flavor='lattice'. The hybrid parser accepts kwargs with both annotations.

    Parameters
    ----------
    filepath : str, Path, bytes, or binary file-like
        Source PDF. Accepts a filesystem path / URL, a ``bytes``-like object,
        or any binary stream with a ``.read()`` method (``io.BytesIO``, an
        open ``"rb"`` file, ``requests`` response ``.raw``, etc). For
        in-memory inputs the bytes are spilled to a temporary file once and
        cleaned up on context-manager exit, so the Lattice OpenCV
        image-conversion backend keeps working unchanged. Originally
        requested in #170 / #245 / #270.
    pages : str, optional (default: '1')
        Comma-separated page numbers.
        Example: '1,3,4' or '1,4-end' or 'all'.
    password : str, optional (default: None)
        Password for decryption.
    flavor : str (default: 'lattice')
        The parsing method to use. Valid values:

        - ``'lattice'`` (default): line-ruled tables.
        - ``'stream'``: borderless tables with whitespace-separated columns.
        - ``'network'``: borderless tables via text-edge alignment
          connectivity.
        - ``'hybrid'``: combines layout- and image-based analysis.
        - ``'ml'``: neural table-structure recognition (Table Transformer)
          for the structure, with cell text filled from the PDF's own text
          layer (no hallucinated values). Requires the optional ML
          dependencies: ``pip install 'camelot-py[ml]'``. Best for
          borderless tables where the heuristic parsers plateau.
        - ``'auto'``: detect the flavor **per page** (count ruled lines on
          each rendered page) and parse each group accordingly — ruled pages
          via ``lattice`` with ``engine='combined'``, the rest via
          ``network`` — then merge. Handles documents that mix text-only
          cover pages with ruled tables deeper in. A ``UserWarning`` reports
          the per-page choices. (More accurate but slower, since it renders
          every page for the probe.)
    suppress_stdout : bool, optional (default: False)
        Suppress logs and warnings.
    parallel : bool, optional (default: False)
        Process pages in parallel using all available cpu cores.
    cpu_count : int, optional (default: None)
        Maximum number of worker processes when ``parallel=True``. ``None``
        (default) uses all available cores. Values are clamped to
        ``[1, multiprocessing.cpu_count()]``. Ignored when
        ``parallel=False``.
    layout_kwargs : dict, optional (default: None)
        A dict of `pdfminer.layout.LAParams
        <https://pdfminersix.readthedocs.io/en/latest/reference/composable.html#laparams>`_
        kwargs. ``None`` is treated as an empty dict.
    per_page : dict, optional (default: None)
        Per-page parameter overrides. Maps a 1-indexed page number (int or
        str) to a dict of any keyword argument otherwise valid for
        ``read_pdf``. Values supplied here override the globally-supplied
        kwargs for that one page only — every other page keeps the global
        values. Useful for multi-layout PDFs where different pages need
        different ``table_areas``, ``columns``, ``flavor``, etc. The
        per-page ``flavor`` itself may be overridden; the global flavor
        applies otherwise. Originally proposed by @sverma25 in #41.

        Example::

            tables = camelot.read_pdf(
                "report.pdf",
                pages="1-3",
                flavor="stream",
                split_text=True,
                per_page={2: {"table_areas": ["120, 210, 400, 90"]}},
            )

        Here pages 1 and 3 use the global ``flavor="stream",
        split_text=True`` only; page 2 uses both *and* the page-specific
        ``table_areas``.
    debug : bool, optional (default: False)
        Forwarded unchanged to ``PDFHandler`` as ``debug=``.
    table_areas : list, optional (default: None)
        List of table area strings of the form x1,y1,x2,y2
        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
        in PDF coordinate space.
    header_text^ : list, optional (default: None)
        List of substrings identifying a text line above a stream table.
        When ``table_areas`` is not supplied and a matching line is found,
        its bottom coordinate becomes the top edge of the derived table
        area. If no match is found, Camelot falls back to automatic table
        detection.
    footer_text^ : list, optional (default: None)
        List of substrings identifying a text line below a stream table.
        When ``table_areas`` is not supplied and a matching line is found,
        its top coordinate becomes the bottom edge of the derived table
        area. If no match is found, Camelot falls back to automatic table
        detection.
    columns^ : list, optional (default: None)
        List of column x-coordinates strings where the coordinates
        are comma-separated.
    split_text : bool, optional (default: False)
        Split text that spans across multiple cells.
    flag_size : bool, optional (default: False)
        Flag text based on font size. Useful to detect
        super/subscripts. Adds <s></s> around flagged text.
    strip_text : str or sequence of str, optional (default: '')
        Characters or substrings to strip from each cell before assignment.
        A ``str`` strips per-character — every character in the string is
        removed wherever it appears (e.g. ``" \n"`` drops all spaces and
        newlines). A list/tuple of ``str`` strips whole substrings (e.g.
        ``["[1]", "[2]"]`` removes those footnote markers but leaves bare
        ``[``/``]`` alone). Whole-substring mode requested in #484.
    replace_text : dict, optional (default: None)
        Mapping of substring → replacement applied to every cell's text
        just before it is written into the table. Keys are matched as
        literal substrings (regex metacharacters are escaped). Useful for
        collapsing soft-broken words (e.g. ``{" \n": " "}``), normalising
        abbreviations, or rewriting unit names. Distinct from
        ``strip_text`` which can only remove characters; this can replace
        with arbitrary text. Requested in #481. (#482)
    row_tol^ : int, optional (default: 2)
        Tolerance parameter used to combine text vertically,
        to generate rows.
    column_tol^ : int, optional (default: 0)
        Tolerance parameter used to combine text horizontally,
        to generate columns.
    process_background* : bool, optional (default: False)
        Process background lines.
    line_scale* : int, optional (default: 15)
        Line size scaling factor. The larger the value the smaller
        the detected lines. Making it very large will lead to text
        being detected as lines.
    copy_text* : list, optional (default: None)
        {'h', 'v'}
        Direction in which text in a spanning cell will be copied
        over.
    shift_text* : list, optional (default: ['l', 't'])
        {'l', 'r', 't', 'b'}
        Direction in which text in a spanning cell will flow.
    line_tol* : int, optional (default: 2)
        Tolerance parameter used to merge close vertical and horizontal
        lines.
    joint_tol* : int, optional (default: 2)
        Tolerance parameter used to decide whether the detected lines
        and points lie close to each other.
    threshold_blocksize* : int, optional (default: 15)
        Size of a pixel neighborhood that is used to calculate a
        threshold value for the pixel: 3, 5, 7, and so on.

        For more information, refer `OpenCV's adaptiveThreshold
        <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
    threshold_constant* : int, optional (default: -2)
        Constant subtracted from the mean or weighted mean.
        Normally, it is positive but may be zero or negative as well.

        For more information, refer `OpenCV's adaptiveThreshold
        <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
    iterations* : int, optional (default: 0)
        Number of dilation passes applied to close small gaps in the line
        mask.

        For more information, refer `OpenCV's dilate
        <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
    erode_iterations* : int, optional (default: 0)
        Number of erosion passes applied **after** dilation. Set equal to
        ``iterations`` for a morphological closing — bridges gaps in ruled
        lines without thickening the mask overall (which avoids the
        spurious extra-row artefact reported in #363). (#363)
    backend* : str, optional by default "pdfium"
        The backend to use for converting the PDF to an image so it can be
        processed by OpenCV.
    use_fallback* : bool, optional
        Fallback to another backend if unavailable, by default True
    resolution* : int, optional (default: 300)
        Resolution used for PDF to PNG conversion.
    engine* : str, optional (default: 'combined')
        Line-detection engine for ``flavor='lattice'`` (and the lattice
        half of ``flavor='hybrid'``):

        - ``'combined'`` (default): render the page and detect ruled lines
          with OpenCV **and** union in the ruled lines read from the PDF's
          native vector graphics, so tables whose rules render faintly
          (vector strokes, anti-aliasing) are still found. Safe by
          construction — raster always runs, vector lines can only add, and
          they're clipped to ``table_regions`` — so it never does worse
          than ``'raster'`` (#763).
        - ``'raster'``: render the page and detect ruled lines with OpenCV
          only — the pre-#763 behaviour.
        - ``'vector'``: detect tables straight from the PDF's vector ruled
          lines, skipping rasterisation entirely — the fastest path, for
          PDFs whose tables are drawn with real vector strokes (#763).

        With ``flavor='hybrid'`` the same choices select how its lattice
        half finds ruled lines; ``engine='vector'`` there is the
        **render-free hybrid** — vector ruled lines merged with the network
        text-edge alignment — for partial-ruled / borderless tables at
        roughly an order of magnitude less time than the raster path (#39).

    Returns
    -------
    tables : camelot.core.TableList

    Raises
    ------
    NotImplementedError
        If ``flavor`` is not one of the values listed above.

    Notes
    -----
    **Encrypted PDFs / extraction permissions** (#590). Camelot honours the
    ``/Encrypt`` dictionary's text-extraction permission: ``read_pdf``
    raises :class:`playa.exceptions.PDFTextExtractionNotAllowed` if the PDF
    is encrypted and the user-password permission set forbids text
    extraction. The check fires on the document object returned by
    ``playa.open`` while the encryption metadata is still attached — this
    is a real behavioural change vs the pre-1.0 backend, where per-page
    temp-PDF splitting silently dropped the metadata so the check was
    effectively a no-op.

    Note: PDF spec only enforces the flag through the encryption layer —
    for **unencrypted** PDFs that carry a "no extraction" claim via
    ``/Perms``, there is no enforcement mechanism and Camelot extracts.
    Supplying the document owner password through ``password=`` bypasses
    the user-password permission set (matches every other PDF tool).

    Examples
    --------
    >>> import camelot
    >>> tables = camelot.read_pdf("foo.pdf")  # xdoctest: +SKIP
    >>> tables.n  # xdoctest: +SKIP
    1
    >>> tables[0].df  # xdoctest: +SKIP
    >>> tables[0].to_csv("foo.csv")  # xdoctest: +SKIP

    Select a parser and restrict extraction to a page range:

    >>> tables = camelot.read_pdf(  # xdoctest: +SKIP
    ...     "foo.pdf", flavor="lattice", pages="1-3"
    ... )
    """
    layout_kwargs = {} if layout_kwargs is None else layout_kwargs

    known_flavors = ("lattice", "stream", "network", "hybrid", "ml", "auto")
    if flavor not in known_flavors:
        raise NotImplementedError(
            "Unknown flavor specified."
            " Use either 'lattice', 'stream', 'network', 'hybrid', 'ml' or 'auto'"
        )

    per_page_norm = _normalize_per_page(per_page)

    # Options handed on unchanged whichever parsing path is taken.
    shared = {
        "suppress_stdout": suppress_stdout,
        "parallel": parallel,
        "cpu_count": cpu_count,
        "layout_kwargs": layout_kwargs,
    }

    with warnings.catch_warnings():
        # The filter is installed before the handler is built so it is
        # already in effect while the PDF is opened.
        if suppress_stdout:
            warnings.simplefilter("ignore")

        with PDFHandler(
            filepath, pages=pages, password=password, debug=debug
        ) as pdf:
            if flavor == "auto":
                # The auto path receives the raw kwargs and does its own
                # per-flavor filtering.
                return _parse_auto(
                    pdf, kwargs, **shared, per_page_norm=per_page_norm
                )

            validate_input(kwargs, flavor=flavor)
            parser_kwargs = remove_extra(kwargs, flavor=flavor)
            _validate_per_page(per_page_norm, flavor)
            return pdf.parse(
                flavor=flavor,
                **shared,
                per_page=per_page_norm,
                **parser_kwargs,
            )
def _parse_auto(
    handler,
    kwargs,
    *,
    suppress_stdout,
    parallel,
    cpu_count,
    layout_kwargs,
    per_page_norm,
):
    """Parse with ``flavor='auto'`` — flavor detected **per page**.

    Each requested page is probed independently (so a document with a
    text-only cover and ruled tables deeper in is routed correctly), then
    pages are grouped by detected flavor and parsed in one pass per group:
    ruled pages via ``lattice`` with ``engine='combined'`` (the most
    accurate detector), the rest via ``network``. Results are merged into a
    single page/order-sorted :class:`~camelot.core.TableList`.

    The page rendered for each lattice page's probe is kept and handed to
    the parser via ``render_cache``, so that page isn't rasterised a second
    time during parsing.

    Parameters
    ----------
    handler
        Open PDF handler; its ``pages``, ``filepath`` and ``password``
        attributes and its ``parse`` method are used.
    kwargs : dict
        Caller-supplied parser kwargs. A copy is filtered per flavor with
        ``remove_extra``; the dict itself is not modified here.
    suppress_stdout, parallel, cpu_count, layout_kwargs
        Forwarded unchanged to ``handler.parse``.
    per_page_norm : dict
        Normalised per-page overrides (page number -> dict of kwargs).
        Forwarded unchanged to ``handler.parse``.

    Returns
    -------
    camelot.core.TableList
        Tables from all groups, sorted by ``(page, order)``.
    """
    with TemporaryDirectory() as cache_dir:
        page_flavor = {}
        render_cache: dict[int, str] = {}
        for pg in handler.pages:
            probe_png = os.path.join(cache_dir, f"auto-probe-page-{pg}.png")
            fl = _detect_flavor(
                handler.filepath,
                password=handler.password or None,
                page=pg,
                png_path=probe_png,
            )
            page_flavor[pg] = fl
            # Only lattice pages reuse the probe render, and only if the
            # probe actually produced a file.
            if fl == "lattice" and os.path.exists(probe_png):
                render_cache[pg] = probe_png

        # stacklevel=3 skips this function and read_pdf so the warning is
        # attributed to the code that called read_pdf.
        warnings.warn(
            f"camelot.read_pdf: auto-detected per-page flavors {page_flavor}",
            UserWarning,
            stacklevel=3,
        )

        # Fixed order: lattice group first, then network; empty groups are
        # dropped.
        groups = {}
        for fl in ("lattice", "network"):
            group_pages = sorted(pg for pg, f in page_flavor.items() if f == fl)
            if group_pages:
                groups[fl] = group_pages

        # Validate every group's overrides before any parsing starts, and
        # check each override only against the flavor its own page was
        # routed to. Checking all overrides against every group's flavor
        # would test an override meant for a page in the other group against
        # the wrong flavor.
        for fl, group_pages in groups.items():
            in_group = set(group_pages)
            _validate_per_page(
                {pg: ov for pg, ov in per_page_norm.items() if pg in in_group},
                fl,
            )

        collected = []
        for fl, group_pages in groups.items():
            group_kwargs = remove_extra(dict(kwargs), flavor=fl)
            if fl == "lattice":
                # Use the combined raster+vector engine — the strongest
                # lattice detector — for the ruled pages auto routes here.
                group_kwargs.setdefault("engine", "combined")
            collected.extend(
                handler.parse(
                    flavor=fl,
                    suppress_stdout=suppress_stdout,
                    parallel=parallel,
                    cpu_count=cpu_count,
                    layout_kwargs=layout_kwargs,
                    per_page=per_page_norm,
                    pages=group_pages,
                    render_cache=render_cache if fl == "lattice" else None,
                    **group_kwargs,
                )
            )

        collected.sort(key=lambda t: (t.page, t.order))
        return TableList(collected)