diff options
Diffstat (limited to 'src/tldr/fetcher.py')
| -rw-r--r-- | src/tldr/fetcher.py | 61 |
1 files changed, 56 insertions, 5 deletions
diff --git a/src/tldr/fetcher.py b/src/tldr/fetcher.py index 26d931f..e03bcb8 100644 --- a/src/tldr/fetcher.py +++ b/src/tldr/fetcher.py @@ -1,5 +1,8 @@ import asyncio +import contextlib import httpx +import os +import sys from bs4 import BeautifulSoup import html2text @@ -19,24 +22,72 @@ def fetch_page(url: str, proxy: str | None = None, headers: dict[str, str] | Non return response.text -def fetch_page_nodriver(url: str, proxy: str | None = None) -> str: +class _FilteredWriter: + def __init__(self, original): + self.original = original + self.skip_lines = 0 + + def write(self, text): + if any(phrase in text for phrase in [ + "successfully removed temp profile", + "Exception ignored in:", + "BaseSubprocessTransport.__del__", + "Event loop is closed", + "Traceback (most recent call last):", + "RuntimeError:", + ".py\", line", + ]): + self.skip_lines = 15 + return + if self.skip_lines > 0: + self.skip_lines -= 1 + return + self.original.write(text) + + def flush(self): + self.original.flush() + + def fileno(self): + return self.original.fileno() + + def __getattr__(self, name): + return getattr(self.original, name) + + +def enable_nodriver_output_filter(): + sys.stdout = _FilteredWriter(sys.__stdout__) + sys.stderr = _FilteredWriter(sys.__stderr__) + + +def fetch_page_nodriver(url: str, proxy: str | None = None, headless: bool = True) -> str: import nodriver as uc + import gc async def _fetch(): browser_args = [] if proxy: browser_args.append(f"--proxy-server={proxy}") - browser = await uc.start(browser_args=browser_args or None) + browser = await uc.start(browser_args=browser_args or None, headless=headless) + html = None try: page = await browser.get(url) await page.wait_for("body", timeout=10) html = await page.get_content() - return html finally: - browser.stop() + try: + if browser: + browser.stop() + await asyncio.sleep(0.05) + except Exception: + pass + return html + + enable_nodriver_output_filter() + result = asyncio.run(_fetch()) + gc.collect() - return asyncio.run(_fetch()) + return result def html_to_text(html: str) -> str: |
