about | summary | refs | log | tree | commit | diff | stats
path: root/src/tldr/fetcher.py
diff options
context:
space:
mode:
Diffstat (limited to 'src/tldr/fetcher.py')
-rw-r--r-- src/tldr/fetcher.py 61
1 files changed, 56 insertions, 5 deletions
diff --git a/src/tldr/fetcher.py b/src/tldr/fetcher.py
index 26d931f..e03bcb8 100644
--- a/src/tldr/fetcher.py
+++ b/src/tldr/fetcher.py
@@ -1,5 +1,8 @@
import asyncio
+import contextlib
import httpx
+import os
+import sys
from bs4 import BeautifulSoup
import html2text
@@ -19,24 +22,72 @@ def fetch_page(url: str, proxy: str | None = None, headers: dict[str, str] | Non
return response.text
-def fetch_page_nodriver(url: str, proxy: str | None = None) -> str:
+class _FilteredWriter:
+ def __init__(self, original):
+ self.original = original
+ self.skip_lines = 0
+
+ def write(self, text):
+ if any(phrase in text for phrase in [
+ "successfully removed temp profile",
+ "Exception ignored in:",
+ "BaseSubprocessTransport.__del__",
+ "Event loop is closed",
+ "Traceback (most recent call last):",
+ "RuntimeError:",
+ ".py\", line",
+ ]):
+ self.skip_lines = 15
+ return
+ if self.skip_lines > 0:
+ self.skip_lines -= 1
+ return
+ self.original.write(text)
+
+ def flush(self):
+ self.original.flush()
+
+ def fileno(self):
+ return self.original.fileno()
+
+ def __getattr__(self, name):
+ return getattr(self.original, name)
+
+
+def enable_nodriver_output_filter():
+ sys.stdout = _FilteredWriter(sys.__stdout__)
+ sys.stderr = _FilteredWriter(sys.__stderr__)
+
+
+def fetch_page_nodriver(url: str, proxy: str | None = None, headless: bool = True) -> str:
import nodriver as uc
+ import gc
async def _fetch():
browser_args = []
if proxy:
browser_args.append(f"--proxy-server={proxy}")
- browser = await uc.start(browser_args=browser_args or None)
+ browser = await uc.start(browser_args=browser_args or None, headless=headless)
+ html = None
try:
page = await browser.get(url)
await page.wait_for("body", timeout=10)
html = await page.get_content()
- return html
finally:
- browser.stop()
+ try:
+ if browser:
+ browser.stop()
+ await asyncio.sleep(0.05)
+ except Exception:
+ pass
+ return html
+
+ enable_nodriver_output_filter()
+ result = asyncio.run(_fetch())
+ gc.collect()
- return asyncio.run(_fetch())
+ return result
def html_to_text(html: str) -> str: