SilkPage¶

The central object returned by every fetch operation.

SilkPage ¶

SilkPage(html: str, *, url: str = '', status: int = 200, headers: dict[str, str] | None = None, metadata: dict[str, Any] | None = None, fetch_tier: int = 0)

Source code in silkweb/parse/page.py

def __init__(
    self,
    html: str,
    *,
    url: str = "",
    status: int = 200,
    headers: dict[str, str] | None = None,
    metadata: dict[str, Any] | None = None,
    fetch_tier: int = 0,
) -> None:
    """
    Build a page from raw markup plus the details of the response.

    Args:
        html: Response body. A falsy value is parsed as a bare ``<html/>``.
        url: Stored on ``self.url``.
        status: Stored on ``self.status``.
        headers: Stored on ``self.headers``; a falsy value becomes a new empty dict.
        metadata: Used as ``self.metadata`` when truthy; otherwise the result of
            ``self._extract_metadata()`` is used.
        fetch_tier: Stored on ``self.fetch_tier``.
    """
    self.html: str = html
    self.url: str = url
    self.status: int = status
    self.headers: dict[str, str] = headers if headers else {}
    self.fetch_tier: int = fetch_tier

    # Parse the body, or a placeholder document when there is no body at all.
    markup = html if html else "<html/>"
    try:
        root = lxml_html.fromstring(markup)
    except (ValueError, etree.ParserError, etree.XMLSyntaxError):
        # e.g. XML sitemap bodies with ``<?xml ...?>`` are not valid HTML for lxml.html
        root = lxml_html.fromstring("<html/>")
    self._lxml_root = root

    # Derived fields come last, once ``_lxml_root`` is set
    # (the extractors presumably read it -- confirm against their definitions).
    self.metadata: dict[str, Any] = metadata if metadata else self._extract_metadata()
    self.text: str = self._extract_text()
    self.markdown: str = self._extract_markdown()

xpath ¶

xpath(expr: str, *, kind: Literal['elements'] = 'elements') -> list[SilkElement]

xpath(expr: str, *, kind: Literal['values']) -> list[Any]

xpath(expr: str, *, kind: Literal['elements', 'values'] = 'elements') -> list[Any]

Run an XPath expression against the page root.

kind="elements" returns SilkElement wrappers (default). Use for node paths.
kind="values" returns raw values (e.g. //@href, /text()), not elements.

Source code in silkweb/parse/page.py

def xpath(self, expr: str, *, kind: Literal["elements", "values"] = "elements") -> list[Any]:
    """
    Evaluate an XPath expression from the page root.

    - `kind="elements"` (default) wraps every element in the result as a
      `SilkElement`; non-element items and non-list results are dropped.
    - `kind="values"` hands back the raw result as a list (e.g. for `//@href`
      or `/text()`); a non-list result is wrapped in a one-item list.

    Any exception raised while evaluating the expression yields an empty list.
    """
    try:
        raw = self._lxml_root.xpath(expr)
    except Exception:
        # Best-effort API: a bad expression means "no results", not a crash.
        return []

    is_sequence = isinstance(raw, list)

    if kind == "values":
        if is_sequence:
            return list(raw)
        return [raw]

    if is_sequence:
        wrapped = []
        for node in raw:
            if isinstance(node, etree._Element):
                wrapped.append(SilkElement(node))
        return wrapped
    return []

links ¶

links(*, external: bool | None = None) -> list[str]

Return all `<a href>` links as absolute URLs.

Parameters:

Name	Type	Description	Default
`external`	`bool \| None`	None returns all links, True returns only external, False returns only internal (same-domain).	`None`

Source code in silkweb/parse/page.py

def links(self, *, external: bool | None = None) -> list[str]:
    """
    Return all <a href> links as absolute URLs.

    Args:
        external: None returns all links, True returns only external,
                  False returns only internal (same-domain).
    """
    hrefs = self._lxml_root.xpath("//a[@href]/@href")
    parsed_base = urlparse(self.url) if self.url else None
    out: list[str] = []
    for href in hrefs:
        if not isinstance(href, str):
            continue
        abs_url = urljoin(self.url or "", href)
        if external is None or parsed_base is None:
            out.append(abs_url)
        else:
            is_ext = bool(
                urlparse(abs_url).netloc and urlparse(abs_url).netloc != parsed_base.netloc
            )
            if (external and is_ext) or (not external and not is_ext):
                out.append(abs_url)
    return out

network_requests ¶

network_requests() -> list[dict[str, Any]]

Return captured network events (browser tiers only, when enabled).

This is populated by tier 2/3 fetchers when capture_network=True.

Source code in silkweb/parse/page.py

def network_requests(self) -> list[dict[str, Any]]:
    """
    Return the network events captured for this page.

    Reads the optional `_network_log` attribute, which tier 2/3 (browser)
    fetchers populate when `capture_network=True`. A shallow copy is returned;
    if the attribute is missing or is not a list, the result is an empty list.
    """
    captured = getattr(self, "_network_log", None)
    if not isinstance(captured, list):
        return []
    # Copy so that callers cannot mutate the stored log through the result.
    return [*captured]

hydration_source ¶

hydration_source() -> str | None

Which hydration script produced JSON, if any (before JSON parse).

Source code in silkweb/parse/page.py

def hydration_source(self) -> str | None:
    """Which hydration script produced JSON, if any (before JSON parse)."""
    next_data = self._lxml_root.xpath("//script[@id='__NEXT_DATA__']/text()")
    for s in next_data:
        if isinstance(s, str) and s.strip():
            parsed = _safe_json_loads(s.strip())
            if isinstance(parsed, dict):
                return "__NEXT_DATA__"
    nuxt_tagged = self._lxml_root.xpath("//script[@id='__NUXT_DATA__']/text()")
    for s in nuxt_tagged:
        if isinstance(s, str) and s.strip():
            parsed = _safe_json_loads(s.strip())
            if isinstance(parsed, dict):
                return "__NUXT_DATA__"
    scripts = self._lxml_root.xpath("//script/text()")
    for s in scripts:
        if not isinstance(s, str):
            continue
        m = re.search(r"__NUXT__\s*=\s*(\{.*\})\s*;?\s*$", s.strip(), flags=re.DOTALL)
        if m:
            parsed = _safe_json_loads(m.group(1))
            if isinstance(parsed, dict):
                return "__NUXT__"
    return None

detect_records ¶

detect_records() -> list[dict[str, Any]]

Heuristic repeated-record detection (no LLM).

For now: find the most repeated (tag, class) among elements under `<body>`, and turn each into a small record dict.

Source code in silkweb/parse/page.py

def detect_records(self) -> list[dict[str, Any]]:
    """
    Heuristic repeated-record detection (no LLM).

    Groups the descendants of <body> that carry a non-empty ``class``
    attribute by (tag, class) and picks the largest group with at least two
    members; the first such group wins ties. One dict is emitted per member
    with the keys ``text`` and ``xpath`` (taken from its ``SilkElement``
    wrapper) and ``url`` (the first nested <a href>, resolved against the
    page URL, or None when there is none).

    Returns an empty list when there is no <body> or no repeated group.
    """
    body_nodes = self._lxml_root.xpath("//body")
    if not (body_nodes and isinstance(body_nodes[0], etree._Element)):
        return []

    # Bucket candidates by their (tag, stripped class attribute) signature.
    groups: dict[tuple[str, str], list[etree._Element]] = {}
    for node in body_nodes[0].iterdescendants():
        if not isinstance(node, etree._Element):
            continue
        class_attr = (node.get("class") or "").strip()
        if class_attr:
            groups.setdefault((node.tag, class_attr), []).append(node)

    # max() keeps the first of equally sized groups, i.e. insertion order.
    repeated: list[etree._Element] = max(
        (members for members in groups.values() if len(members) >= 2),
        key=len,
        default=[],
    )

    records: list[dict[str, Any]] = []
    for node in repeated:
        wrapped = SilkElement(node)
        anchors = node.xpath(".//a[@href]")
        if anchors and isinstance(anchors[0], etree._Element):
            href = anchors[0].get("href")
        else:
            href = None
        record = {
            "text": wrapped.text,
            "xpath": wrapped.xpath,
            "url": urljoin(self.url or "", href) if href else None,
        }
        records.append(record)
    return records

SilkElement ¶

SilkElement(element: _Element)

Source code in silkweb/parse/page.py

def __init__(self, element: etree._Element) -> None:
    """Wrap *element*, keeping a reference to it on ``self._el``."""
    self._el = element

SilkMeta `dataclass` ¶

SilkMeta(url: str, fetched_at: datetime, fetch_tier: int, xpath: str, llm_model: str | None = None, selector_from_cache: bool | None = None, confidence: float | None = None)

SilkPage¶

SilkPage ¶

xpath ¶

links ¶

network_requests ¶

hydration_source ¶

detect_records ¶

SilkElement ¶

SilkMeta dataclass ¶

SilkMeta `dataclass` ¶