about summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r-- src/tldr/cli.py 7
-rw-r--r-- src/tldr/fetcher.py 61
-rw-r--r-- src/tldr/providers.py 5
3 files changed, 64 insertions, 9 deletions
diff --git a/src/tldr/cli.py b/src/tldr/cli.py
index 7e59350..5d66eda 100644
--- a/src/tldr/cli.py
+++ b/src/tldr/cli.py
@@ -7,11 +7,12 @@ from .providers import summarize
@click.argument("url")
@click.option("--proxy", "-p", help="Proxy URL (e.g., http://proxy:8080)")
@click.option("--header", "-H", multiple=True, help="HTTP header (format: 'Name: Value')")
-@click.option("--nodriver", "-n", is_flag=True, help="Use nodriver (headless browser) to fetch JS-rendered pages")
+@click.option("--nodriver", "-n", is_flag=True, help="Use nodriver (headless browser by default) to fetch JS-rendered pages")
+@click.option("--debug", "-d", is_flag=True, help="Show browser window when using --nodriver")
@click.option("--model", "-m", required=True, help="Model to use (e.g., anthropic/sonnet, openai/gpt-4o, gemini/flash)")
@click.option("--raw", "-r", is_flag=True, help="Output raw text instead of summary")
def main(url: str, proxy: str | None, header: tuple[str, ...], nodriver: bool,
- model: str, raw: bool):
+ debug: bool, model: str, raw: bool):
"""Download and summarize a webpage.
Example:
@@ -33,7 +34,7 @@ def main(url: str, proxy: str | None, header: tuple[str, ...], nodriver: bool,
try:
click.echo(f"Fetching {url}{' (nodriver)' if nodriver else ''}...", err=True)
if nodriver:
- html = fetch_page_nodriver(url, proxy=proxy)
+ html = fetch_page_nodriver(url, proxy=proxy, headless=not debug)
else:
html = fetch_page(url, proxy=proxy, headers=headers or None)
text = html_to_text(html)
diff --git a/src/tldr/fetcher.py b/src/tldr/fetcher.py
index 26d931f..e03bcb8 100644
--- a/src/tldr/fetcher.py
+++ b/src/tldr/fetcher.py
@@ -1,5 +1,8 @@
import asyncio
+import contextlib
import httpx
+import os
+import sys
from bs4 import BeautifulSoup
import html2text
@@ -19,24 +22,72 @@ def fetch_page(url: str, proxy: str | None = None, headers: dict[str, str] | Non
return response.text
-def fetch_page_nodriver(url: str, proxy: str | None = None) -> str:
+class _FilteredWriter:
+ def __init__(self, original):
+ self.original = original
+ self.skip_lines = 0
+
+ def write(self, text):
+ if any(phrase in text for phrase in [
+ "successfully removed temp profile",
+ "Exception ignored in:",
+ "BaseSubprocessTransport.__del__",
+ "Event loop is closed",
+ "Traceback (most recent call last):",
+ "RuntimeError:",
+ ".py\", line",
+ ]):
+ self.skip_lines = 15
+ return
+ if self.skip_lines > 0:
+ self.skip_lines -= 1
+ return
+ self.original.write(text)
+
+ def flush(self):
+ self.original.flush()
+
+ def fileno(self):
+ return self.original.fileno()
+
+ def __getattr__(self, name):
+ return getattr(self.original, name)
+
+
+def enable_nodriver_output_filter():
+ sys.stdout = _FilteredWriter(sys.__stdout__)
+ sys.stderr = _FilteredWriter(sys.__stderr__)
+
+
+def fetch_page_nodriver(url: str, proxy: str | None = None, headless: bool = True) -> str:
import nodriver as uc
+ import gc
async def _fetch():
browser_args = []
if proxy:
browser_args.append(f"--proxy-server={proxy}")
- browser = await uc.start(browser_args=browser_args or None)
+ browser = await uc.start(browser_args=browser_args or None, headless=headless)
+ html = None
try:
page = await browser.get(url)
await page.wait_for("body", timeout=10)
html = await page.get_content()
- return html
finally:
- browser.stop()
+ try:
+ if browser:
+ browser.stop()
+ await asyncio.sleep(0.05)
+ except Exception:
+ pass
+ return html
+
+ enable_nodriver_output_filter()
+ result = asyncio.run(_fetch())
+ gc.collect()
- return asyncio.run(_fetch())
+ return result
def html_to_text(html: str) -> str:
diff --git a/src/tldr/providers.py b/src/tldr/providers.py
index 4e2f6b0..338c87b 100644
--- a/src/tldr/providers.py
+++ b/src/tldr/providers.py
@@ -4,5 +4,8 @@ from multillm import Client
def summarize(content: str, url: str, model: str) -> str:
client = Client()
- prompt = f"Summarize the following webpage content from {url}. Be concise but comprehensive:\n\n{content[:50000]}"
+ prompt = f"""The content below is from {url}. Summarize ONLY the provided content - do not fetch the URL or use web search. Be concise but comprehensive.
+
+Content:
+{content[:50000]}"""
return asyncio.run(client.single(model, prompt))