diff options
| -rw-r--r-- | README.md | 4 | ||||
| -rw-r--r-- | src/tldr/cli.py | 7 | ||||
| -rw-r--r-- | src/tldr/fetcher.py | 61 | ||||
| -rw-r--r-- | src/tldr/providers.py | 5 | ||||
| -rw-r--r-- | tldr.1 | 13 |
5 files changed, 80 insertions, 10 deletions
@@ -22,6 +22,7 @@ tldr <URL> -m <provider/model> | `-p, --proxy` | Proxy URL (e.g., `http://proxy:8080`) | | `-H, --header` | HTTP header (`Name: Value`). Repeatable | | `-n, --nodriver` | Use headless browser for JS-rendered pages | +| `-d, --debug` | Show browser window when using `--nodriver` | | `-r, --raw` | Output raw text without summarization | | `-h, --help` | Show help | @@ -52,6 +53,9 @@ tldr https://example.com -H "Authorization: Bearer token" -m openai/gpt-4o # Raw text output tldr https://example.com --raw + +# Debug mode (show browser window) +tldr https://example.com --nodriver --debug -m claude/sonnet ``` ## Environment Variables diff --git a/src/tldr/cli.py b/src/tldr/cli.py index 7e59350..5d66eda 100644 --- a/src/tldr/cli.py +++ b/src/tldr/cli.py @@ -7,11 +7,12 @@ from .providers import summarize @click.argument("url") @click.option("--proxy", "-p", help="Proxy URL (e.g., http://proxy:8080)") @click.option("--header", "-H", multiple=True, help="HTTP header (format: 'Name: Value')") -@click.option("--nodriver", "-n", is_flag=True, help="Use nodriver (headless browser) to fetch JS-rendered pages") +@click.option("--nodriver", "-n", is_flag=True, help="Use nodriver (headless browser by default) to fetch JS-rendered pages") +@click.option("--debug", "-d", is_flag=True, help="Show browser window when using --nodriver") @click.option("--model", "-m", required=True, help="Model to use (e.g., anthropic/sonnet, openai/gpt-4o, gemini/flash)") @click.option("--raw", "-r", is_flag=True, help="Output raw text instead of summary") def main(url: str, proxy: str | None, header: tuple[str, ...], nodriver: bool, - model: str, raw: bool): + debug: bool, model: str, raw: bool): """Download and summarize a webpage. Example: @@ -33,7 +34,7 @@ def main(url: str, proxy: str | None, header: tuple[str, ...], nodriver: bool, try: click.echo(f"Fetching {url}{' (nodriver)' if nodriver else ''}...", err=True) if nodriver: - html = fetch_page_nodriver(url, proxy=proxy) + html = fetch_page_nodriver(url, proxy=proxy, headless=not debug) else: html = fetch_page(url, proxy=proxy, headers=headers or None) text = html_to_text(html) diff --git a/src/tldr/fetcher.py b/src/tldr/fetcher.py index 26d931f..e03bcb8 100644 --- a/src/tldr/fetcher.py +++ b/src/tldr/fetcher.py @@ -1,5 +1,8 @@ import asyncio +import contextlib import httpx +import os +import sys from bs4 import BeautifulSoup import html2text @@ -19,24 +22,72 @@ def fetch_page(url: str, proxy: str | None = None, headers: dict[str, str] | Non return response.text -def fetch_page_nodriver(url: str, proxy: str | None = None) -> str: +class _FilteredWriter: + def __init__(self, original): + self.original = original + self.skip_lines = 0 + + def write(self, text): + if any(phrase in text for phrase in [ + "successfully removed temp profile", + "Exception ignored in:", + "BaseSubprocessTransport.__del__", + "Event loop is closed", + "Traceback (most recent call last):", + "RuntimeError:", + ".py\", line", + ]): + self.skip_lines = 15 + return + if self.skip_lines > 0: + self.skip_lines -= 1 + return + self.original.write(text) + + def flush(self): + self.original.flush() + + def fileno(self): + return self.original.fileno() + + def __getattr__(self, name): + return getattr(self.original, name) + + +def enable_nodriver_output_filter(): + sys.stdout = _FilteredWriter(sys.__stdout__) + sys.stderr = _FilteredWriter(sys.__stderr__) + + +def fetch_page_nodriver(url: str, proxy: str | None = None, headless: bool = True) -> str: import nodriver as uc + import gc async def _fetch(): browser_args = [] if proxy: browser_args.append(f"--proxy-server={proxy}") - browser = await uc.start(browser_args=browser_args or None) + browser = await uc.start(browser_args=browser_args or None, headless=headless) + html = None try: page = await browser.get(url) await page.wait_for("body", timeout=10) html = await page.get_content() - return html finally: - browser.stop() + try: + if browser: + browser.stop() + await asyncio.sleep(0.05) + except Exception: + pass + return html + + enable_nodriver_output_filter() + result = asyncio.run(_fetch()) + gc.collect() - return asyncio.run(_fetch()) + return result def html_to_text(html: str) -> str: diff --git a/src/tldr/providers.py b/src/tldr/providers.py index 4e2f6b0..338c87b 100644 --- a/src/tldr/providers.py +++ b/src/tldr/providers.py @@ -4,5 +4,8 @@ from multillm import Client def summarize(content: str, url: str, model: str) -> str: client = Client() - prompt = f"Summarize the following webpage content from {url}. Be concise but comprehensive:\n\n{content[:50000]}" + prompt = f"""The content below is from {url}. Summarize ONLY the provided content - do not fetch the URL or use web search. Be concise but comprehensive. + +Content: +{content[:50000]}""" return asyncio.run(client.single(model, prompt)) @@ -23,9 +23,12 @@ Proxy URL for HTTP requests (e.g., http://proxy:8080). HTTP header in "Name: Value" format. Can be specified multiple times. .TP .BR \-n ", " \-\-nodriver -Use nodriver (headless browser) to fetch JavaScript-rendered pages. +Use nodriver (headless browser by default) to fetch JavaScript-rendered pages. Incompatible with \fB\-\-header\fR. Supports \fB\-\-proxy\fR. .TP +.BR \-d ", " \-\-debug +Show browser window when using \fB\-\-nodriver\fR (disables headless mode). +.TP .BR \-r ", " \-\-raw Output raw extracted text instead of LLM summary. .TP @@ -93,6 +96,14 @@ Output raw text without summarization: tldr https://example.com --raw .fi .RE +.PP +Debug mode (show browser window): +.PP +.RS +.nf +tldr https://example.com --nodriver --debug -m claude/sonnet +.fi +.RE .SH SEE ALSO .BR curl (1), .BR wget (1) |
