"""IO related functions to Read the PDF and returns extracted tables."""
import os
import warnings
from typing import Any
from .core import TableList
from .handlers import FilepathOrBuffer
from .handlers import PDFHandler
from .utils import TemporaryDirectory
from .utils import remove_extra
from .utils import validate_input
# Per-axis minimum number of ruled line segments for the auto-flavor heuristic
# to call a page "ruled": the vertical count and the horizontal count are each
# compared against this value, and both must reach it. Two lines per axis
# catches even tiny ruled tables (a 2-row 2-col grid produces 3 horizontal +
# 3 vertical lines including borders) while keeping borderless pages with one
# stray underline accent from being mis-classified.
_AUTO_FLAVOR_LINE_THRESHOLD = 2
def _detect_flavor(filepath, password=None, page=1, png_path=None):
"""Pick the most appropriate flavor for a single PDF page.
Renders ``page``, thresholds it, and counts ruled horizontal and
vertical line segments. Used when the caller passes ``flavor="auto"``
(once per requested page, so a document with text-only cover pages and
ruled tables deeper in is routed correctly per page).
Returns
-------
str
Either ``"lattice"`` (enough ruled lines on the rendered page) or
``"network"`` (else). ``"network"`` is also the fallback when
rendering itself fails (e.g. unreadable PDF, missing backend
dependencies) — the assumption is that giving the text-based parser
a chance is more useful than raising before parsing starts.
"""
# Local imports keep \`camelot.read_pdf\` import-time cheap — cv2/playa
# imports already weigh in for the parsers; deferring these for the
# default \`flavor="lattice"\` path costs nothing.
from .backends import ImageConversionBackend
from .image_processing import adaptive_threshold
from .image_processing import find_lines
def _probe(target_png):
# ImageConversionBackend.convert takes (pdf_path, png_path, page); it
# has no `resolution` kwarg — passing one raised TypeError that the
# except below silently swallowed, so 'auto' always fell back to
# 'network'. (#auto-flavor regression)
ImageConversionBackend().convert(str(filepath), target_png, page=page)
if not os.path.exists(target_png):
return None
_, threshold = adaptive_threshold(target_png, process_background=False)
# Use the Lattice default line_scale (15) — picking 40 here excludes
# legitimate small/medium ruled tables.
_, v = find_lines(threshold, direction="vertical", line_scale=15)
_, h = find_lines(threshold, direction="horizontal", line_scale=15)
return len(v), len(h)
try:
if png_path is None:
# No reuse requested: render to a throwaway temp file.
with TemporaryDirectory() as tmpdir:
counts = _probe(os.path.join(tmpdir, "auto_flavor_probe.png"))
else:
# Render to the caller-owned path so the rendered page can be
# reused by the parser (avoids a second render — see _parse_auto).
counts = _probe(png_path)
except Exception:
# Any failure on the probe (no usable backend, encrypted page, broken
# PDF, OpenCV import surprise) is *not* fatal — the user asked us to
# pick, we pick the more forgiving option and let the parser report
# the real error if any.
return "network"
if counts is None:
return "network"
v_count, h_count = counts
has_grid = (
v_count >= _AUTO_FLAVOR_LINE_THRESHOLD
and h_count >= _AUTO_FLAVOR_LINE_THRESHOLD
)
return "lattice" if has_grid else "network"
def _normalize_per_page(per_page):
"""Coerce ``per_page`` to ``{int: dict}`` form, raising ValueError on bad input.
Accepts None / empty (returns ``{}``), int or str keys, and dict
values. Other shapes raise ``ValueError`` with a precise message
naming the offending entry. Values are shallow-copied so a later
in-place edit doesn't mutate the caller's dict.
"""
if per_page is None:
return {}
per_page_norm: dict[int, dict[str, Any]] = {}
for k, v in per_page.items():
try:
page_no = int(k)
except (TypeError, ValueError) as exc:
raise ValueError(f"per_page keys must be page numbers, got {k!r}") from exc
if not isinstance(v, dict):
raise ValueError(
f"per_page[{k!r}] must be a dict of kwargs, got {type(v).__name__}"
)
per_page_norm[page_no] = dict(v)
return per_page_norm
def _validate_per_page(per_page_norm, global_flavor):
"""Validate each per-page override against its effective flavor.
Each entry's flavor is either an explicit override (must be one of
the four concrete flavors — ``"auto"`` doesn't make sense
page-by-page) or the global flavor. All non-flavor kwargs are then
checked with the existing :func:`validate_input` against that
effective flavor.
"""
for page_no, overrides in per_page_norm.items():
page_flavor = overrides.get("flavor", global_flavor)
if page_flavor not in ("lattice", "stream", "network", "hybrid", "ml"):
raise NotImplementedError(
f"per_page[{page_no}] flavor={page_flavor!r} is not"
" one of: 'lattice', 'stream', 'network', 'hybrid', 'ml'."
" ('auto' is only valid as the global flavor.)"
)
page_kwargs = {k: v for k, v in overrides.items() if k != "flavor"}
validate_input(page_kwargs, flavor=page_flavor)
[docs]
def read_pdf(
    filepath: FilepathOrBuffer,
    pages="1",
    password=None,
    flavor="lattice",
    suppress_stdout=False,
    parallel=False,
    cpu_count=None,
    layout_kwargs=None,
    per_page=None,
    debug=False,
    **kwargs,
):
    r"""Read a PDF and return the tables extracted from it.

    Note: kwargs annotated with ^ can only be used with flavor='stream' or
    flavor='network' and kwargs annotated with * can only be used with
    flavor='lattice'. The hybrid parser accepts kwargs with both
    annotations.

    Parameters
    ----------
    filepath : str, Path, bytes, or binary file-like
        Source PDF. Accepts a filesystem path / URL, a ``bytes``-like
        object, or any binary stream with a ``.read()`` method
        (``io.BytesIO``, an open ``"rb"`` file, ``requests`` response
        ``.raw``, etc). For in-memory inputs the bytes are spilled to
        a temporary file once and cleaned up on context-manager exit,
        so the Lattice OpenCV image-conversion backend keeps working
        unchanged. Originally requested in #170 / #245 / #270.
    pages : str, optional (default: '1')
        Comma-separated page numbers.
        Example: '1,3,4' or '1,4-end' or 'all'.
    password : str, optional (default: None)
        Password for decryption.
    flavor : str (default: 'lattice')
        The parsing method to use. Valid values:

        - ``'lattice'`` (default): line-ruled tables.
        - ``'stream'``: borderless tables with whitespace-separated
          columns.
        - ``'network'``: borderless tables via text-edge alignment
          connectivity.
        - ``'hybrid'``: combines layout- and image-based analysis.
        - ``'ml'``: neural table-structure recognition (Table Transformer)
          for the structure, with cell text filled from the PDF's own text
          layer (no hallucinated values). Requires the optional ML
          dependencies: ``pip install 'camelot-py[ml]'``. Best for
          borderless tables where the heuristic parsers plateau.
        - ``'auto'``: detect the flavor **per page** (count ruled lines on
          each rendered page) and parse each group accordingly — ruled
          pages via ``lattice`` with ``engine='combined'``, the rest via
          ``network`` — then merge. Handles documents that mix text-only
          cover pages with ruled tables deeper in. A ``UserWarning``
          reports the per-page choices. (More accurate but slower, since
          it renders every page for the probe.)
    suppress_stdout : bool, optional (default: False)
        Suppress logs and warnings.
    parallel : bool, optional (default: False)
        Process pages in parallel using all available cpu cores.
    cpu_count : int, optional (default: None)
        Maximum number of worker processes when ``parallel=True``. ``None``
        (default) uses all available cores. Values are clamped to
        ``[1, multiprocessing.cpu_count()]``. Ignored when
        ``parallel=False``.
    layout_kwargs : dict, optional (default: None)
        A dict of `pdfminer.layout.LAParams
        <https://pdfminersix.readthedocs.io/en/latest/reference/composable.html#laparams>`_ kwargs.
        ``None`` is treated as an empty dict.
    per_page : dict, optional (default: None)
        Per-page parameter overrides. Maps a 1-indexed page number (int
        or str) to a dict of any keyword argument otherwise valid for
        ``read_pdf``. Values supplied here override the globally-supplied
        kwargs for that one page only — every other page keeps the global
        values. Useful for multi-layout PDFs where different pages need
        different ``table_areas``, ``columns``, ``flavor``, etc. The
        per-page ``flavor`` itself may be overridden; the global flavor
        applies otherwise. Originally proposed by @sverma25 in #41.

        Example::

            tables = camelot.read_pdf(
                "report.pdf",
                pages="1-3",
                flavor="stream",
                split_text=True,
                per_page={2: {"table_areas": ["120, 210, 400, 90"]}},
            )

        Here pages 1 and 3 use the global ``flavor="stream",
        split_text=True`` only; page 2 uses both *and* the page-specific
        ``table_areas``.
    debug : bool, optional (default: False)
        Forwarded unchanged to :class:`PDFHandler`.
    table_areas : list, optional (default: None)
        List of table area strings of the form x1,y1,x2,y2
        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
        in PDF coordinate space.
    header_text^ : list, optional (default: None)
        List of substrings identifying a text line above a stream table.
        When ``table_areas`` is not supplied and a matching line is found,
        its bottom coordinate becomes the top edge of the derived table
        area. If no match is found, Camelot falls back to automatic table
        detection.
    footer_text^ : list, optional (default: None)
        List of substrings identifying a text line below a stream table.
        When ``table_areas`` is not supplied and a matching line is found,
        its top coordinate becomes the bottom edge of the derived table
        area. If no match is found, Camelot falls back to automatic table
        detection.
    columns^ : list, optional (default: None)
        List of column x-coordinates strings where the coordinates
        are comma-separated.
    split_text : bool, optional (default: False)
        Split text that spans across multiple cells.
    flag_size : bool, optional (default: False)
        Flag text based on font size. Useful to detect
        super/subscripts. Adds <s></s> around flagged text.
    strip_text : str or sequence of str, optional (default: '')
        Characters or substrings to strip from each cell before
        assignment. A ``str`` strips per-character — every character in
        the string is removed wherever it appears (e.g. ``" \n"`` drops
        all spaces and newlines). A list/tuple of ``str`` strips whole
        substrings (e.g. ``["[1]", "[2]"]`` removes those footnote
        markers but leaves bare ``[``/``]`` alone). Whole-substring
        mode requested in #484.
    replace_text : dict, optional (default: None)
        Mapping of substring → replacement applied to every cell's
        text just before it is written into the table. Keys are
        matched as literal substrings (regex metacharacters are
        escaped). Useful for collapsing soft-broken words (e.g.
        ``{" \n": " "}``), normalising abbreviations, or rewriting
        unit names. Distinct from ``strip_text`` which can only
        remove characters; this can replace with arbitrary text.
        Requested in #481. (#482)
    row_tol^ : int, optional (default: 2)
        Tolerance parameter used to combine text vertically,
        to generate rows.
    column_tol^ : int, optional (default: 0)
        Tolerance parameter used to combine text horizontally,
        to generate columns.
    process_background* : bool, optional (default: False)
        Process background lines.
    line_scale* : int, optional (default: 15)
        Line size scaling factor. The larger the value the smaller
        the detected lines. Making it very large will lead to text
        being detected as lines.
    copy_text* : list, optional (default: None)
        {'h', 'v'}
        Direction in which text in a spanning cell will be copied
        over.
    shift_text* : list, optional (default: ['l', 't'])
        {'l', 'r', 't', 'b'}
        Direction in which text in a spanning cell will flow.
    line_tol* : int, optional (default: 2)
        Tolerance parameter used to merge close vertical and horizontal
        lines.
    joint_tol* : int, optional (default: 2)
        Tolerance parameter used to decide whether the detected lines
        and points lie close to each other.
    threshold_blocksize* : int, optional (default: 15)
        Size of a pixel neighborhood that is used to calculate a
        threshold value for the pixel: 3, 5, 7, and so on.
        For more information, refer `OpenCV's adaptiveThreshold
        <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
    threshold_constant* : int, optional (default: -2)
        Constant subtracted from the mean or weighted mean.
        Normally, it is positive but may be zero or negative as well.
        For more information, refer `OpenCV's adaptiveThreshold
        <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
    iterations* : int, optional (default: 0)
        Number of dilation passes applied to close small gaps in the
        line mask.
        For more information, refer `OpenCV's dilate
        <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
    erode_iterations* : int, optional (default: 0)
        Number of erosion passes applied **after** dilation. Set equal
        to ``iterations`` for a morphological closing — bridges gaps
        in ruled lines without thickening the mask overall (which
        avoids the spurious extra-row artefact reported in #363). (#363)
    backend* : str, optional (default: 'pdfium')
        The backend to use for converting the PDF to an image so it can
        be processed by OpenCV.
    use_fallback* : bool, optional (default: True)
        Fallback to another backend if unavailable.
    resolution* : int, optional (default: 300)
        Resolution used for PDF to PNG conversion.
    engine* : str, optional (default: 'combined')
        Line-detection engine for ``flavor='lattice'`` (and the lattice
        half of ``flavor='hybrid'``):

        - ``'combined'`` (default): render the page and detect ruled lines
          with OpenCV **and** union in the ruled lines read from the PDF's
          native vector graphics, so tables whose rules render faintly
          (vector strokes, anti-aliasing) are still found. Safe by
          construction — raster always runs, vector lines can only add,
          and they're clipped to ``table_regions`` — so it never does
          worse than ``'raster'`` (#763).
        - ``'raster'``: render the page and detect ruled lines with OpenCV
          only — the pre-#763 behaviour.
        - ``'vector'``: detect tables straight from the PDF's vector ruled
          lines, skipping rasterisation entirely — the fastest path, for
          PDFs whose tables are drawn with real vector strokes (#763).

        With ``flavor='hybrid'`` the same choices select how its lattice
        half finds ruled lines; ``engine='vector'`` there is the
        **render-free hybrid** — vector ruled lines merged with the
        network text-edge alignment — for partial-ruled / borderless
        tables at roughly an order of magnitude less time than the raster
        path (#39).

    Returns
    -------
    tables : camelot.core.TableList

    Raises
    ------
    NotImplementedError
        If ``flavor`` is not one of the values listed above.

    Notes
    -----
    **Encrypted PDFs / extraction permissions** (#590). Camelot honours the
    ``/Encrypt`` dictionary's text-extraction permission: ``read_pdf`` raises
    :class:`playa.exceptions.PDFTextExtractionNotAllowed` if the PDF is
    encrypted and the user-password permission set forbids text extraction.
    The check fires on the document object returned by ``playa.open`` while
    the encryption metadata is still attached — this is a real behavioural
    change vs the pre-1.0 backend, where per-page temp-PDF splitting
    silently dropped the metadata so the check was effectively a no-op.

    Note: PDF spec only enforces the flag through the encryption layer —
    for **unencrypted** PDFs that carry a "no extraction" claim via
    ``/Perms``, there is no enforcement mechanism and Camelot extracts.
    Supplying the document owner password through ``password=`` bypasses
    the user-password permission set (matches every other PDF tool).

    Examples
    --------
    >>> import camelot
    >>> tables = camelot.read_pdf("foo.pdf")  # xdoctest: +SKIP
    >>> tables.n  # xdoctest: +SKIP
    1
    >>> tables[0].df  # xdoctest: +SKIP
    >>> tables[0].to_csv("foo.csv")  # xdoctest: +SKIP

    Select a parser and restrict extraction to a page range:

    >>> tables = camelot.read_pdf(  # xdoctest: +SKIP
    ...     "foo.pdf", flavor="lattice", pages="1-3"
    ... )
    """
    # Reject an unknown flavor before any file is touched.
    known_flavors = ("lattice", "stream", "network", "hybrid", "ml", "auto")
    if flavor not in known_flavors:
        raise NotImplementedError(
            "Unknown flavor specified."
            " Use either 'lattice', 'stream', 'network', 'hybrid', 'ml' or 'auto'"
        )

    layout_kwargs = {} if layout_kwargs is None else layout_kwargs
    overrides_by_page = _normalize_per_page(per_page)

    # Options passed identically to both the auto and the explicit path.
    shared_options = {
        "suppress_stdout": suppress_stdout,
        "parallel": parallel,
        "cpu_count": cpu_count,
        "layout_kwargs": layout_kwargs,
    }

    with warnings.catch_warnings():
        if suppress_stdout:
            warnings.simplefilter("ignore")
        with PDFHandler(
            filepath, pages=pages, password=password, debug=debug
        ) as handler:
            if flavor == "auto":
                # Per-page detection does its own kwarg filtering and
                # per-page validation for each detected group.
                return _parse_auto(
                    handler,
                    kwargs,
                    **shared_options,
                    per_page_norm=overrides_by_page,
                )

            validate_input(kwargs, flavor=flavor)
            kwargs = remove_extra(kwargs, flavor=flavor)
            _validate_per_page(overrides_by_page, flavor)
            return handler.parse(
                flavor=flavor,
                **shared_options,
                per_page=overrides_by_page,
                **kwargs,
            )
def _parse_auto(
    handler,
    kwargs,
    *,
    suppress_stdout,
    parallel,
    cpu_count,
    layout_kwargs,
    per_page_norm,
):
    """Parse with ``flavor='auto'`` — flavor detected **per page**.

    Each requested page is probed independently (so a document with a
    text-only cover and ruled tables deeper in is routed correctly), then
    pages are grouped by detected flavor and parsed in one pass per group:
    ruled pages via ``lattice`` with ``engine='combined'`` (unless the
    caller supplied an ``engine``), the rest via ``network``. Results are
    merged into a single page/order-sorted :class:`~camelot.core.TableList`.

    The page rendered for each lattice page's probe is kept and handed to
    the parser via ``render_cache``, so that page isn't rasterised a second
    time during parsing.

    Parameters
    ----------
    handler : PDFHandler
        Open handler; its ``pages``, ``filepath``, ``password`` and
        ``parse`` are used.
    kwargs : dict
        User-supplied parser kwargs. A copy is filtered per group with
        ``remove_extra``; the dict itself is not modified here.
    suppress_stdout, parallel, cpu_count, layout_kwargs
        Forwarded unchanged to ``handler.parse``.
    per_page_norm : dict
        Normalized ``{page_number: overrides}`` mapping. Each entry is
        validated against the flavor detected for *its own* page; entries
        for pages that are not being parsed are not validated.

    Returns
    -------
    camelot.core.TableList
        Tables from all groups, sorted by ``(page, order)``.

    Warns
    -----
    UserWarning
        Always, reporting the flavor detected for each page.
    """
    with TemporaryDirectory() as cache_dir:
        page_flavor = {}
        render_cache: dict[int, str] = {}
        for pg in handler.pages:
            probe_png = os.path.join(cache_dir, f"auto-probe-page-{pg}.png")
            fl = _detect_flavor(
                handler.filepath,
                password=handler.password or None,
                page=pg,
                png_path=probe_png,
            )
            page_flavor[pg] = fl
            # Only lattice pages reuse the render.
            if fl == "lattice" and os.path.exists(probe_png):
                render_cache[pg] = probe_png

        # stacklevel=3 skips this helper and read_pdf.
        warnings.warn(
            f"camelot.read_pdf: auto-detected per-page flavors {page_flavor}",
            UserWarning,
            stacklevel=3,
        )

        collected = []
        for fl in ("lattice", "network"):
            group_pages = sorted(pg for pg, f in page_flavor.items() if f == fl)
            if not group_pages:
                continue
            group_kwargs = remove_extra(dict(kwargs), flavor=fl)
            if fl == "lattice":
                # Use the combined raster+vector engine — the strongest
                # lattice detector — for the ruled pages auto routes here.
                group_kwargs.setdefault("engine", "combined")
            # Validate only the overrides that belong to this group's pages.
            # Checking every entry against every group's flavor would reject
            # e.g. a text-only override on a network page as soon as some
            # other page is detected as lattice.
            group_overrides = {
                pg: per_page_norm[pg] for pg in group_pages if pg in per_page_norm
            }
            _validate_per_page(group_overrides, fl)
            collected.extend(
                handler.parse(
                    flavor=fl,
                    suppress_stdout=suppress_stdout,
                    parallel=parallel,
                    cpu_count=cpu_count,
                    layout_kwargs=layout_kwargs,
                    per_page=per_page_norm,
                    pages=group_pages,
                    render_cache=render_cache if fl == "lattice" else None,
                    **group_kwargs,
                )
            )
        collected.sort(key=lambda t: (t.page, t.order))
        return TableList(collected)