forked from CloakHQ/CloakBrowser
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlangchain_loader.py
More file actions
51 lines (36 loc) · 1.41 KB
/
langchain_loader.py
File metadata and controls
51 lines (36 loc) · 1.41 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
"""LangChain + CloakBrowser: load web pages behind bot detection into LangChain Documents.
LangChain's PlaywrightURLLoader hardcodes chromium.launch() with no way to pass
a custom binary. This example uses CloakBrowser directly as a stealth document loader
that produces LangChain Document objects.
Requires: pip install langchain-core cloakbrowser
"""
import asyncio
from langchain_core.documents import Document
from cloakbrowser import launch_async
async def load_urls_stealth(urls: list[str], **launch_kwargs) -> list[Document]:
    """Load URLs with a CloakBrowser stealth browser and return LangChain Documents.

    Every URL is visited in order on one shared page. For each URL, the
    page's ``document.body.innerText`` becomes the document's
    ``page_content``; the URL and the page title are stored in ``metadata``
    under the keys ``"source"`` and ``"title"``.

    Args:
        urls: URLs to visit, in order.
        **launch_kwargs: Extra keyword arguments forwarded to
            ``cloakbrowser.launch_async``. ``headless`` defaults to ``True``
            and can be overridden here.

    Returns:
        One ``Document`` per URL, in the same order as ``urls``.

    Raises:
        Any exception raised while opening the page, navigating, or reading
        the page is propagated unchanged, after the browser has been closed.
    """
    # Treat headless=True as a default rather than a fixed keyword, so a
    # caller passing headless=... does not trigger
    # "TypeError: got multiple values for keyword argument 'headless'".
    launch_options = {"headless": True, **launch_kwargs}
    browser = await launch_async(**launch_options)
    try:
        page = await browser.new_page()
        docs = []
        for url in urls:
            await page.goto(url, wait_until="domcontentloaded")
            text = await page.evaluate("document.body.innerText")
            title = await page.title()
            docs.append(Document(
                page_content=text,
                metadata={"source": url, "title": title},
            ))
        return docs
    finally:
        # Close the browser even when a step above fails, so a failed load
        # does not leave the browser open.
        await browser.close()
async def main():
    """Demo: load two sample pages via the stealth loader and print a preview.

    For each returned document, prints a header line with its title and
    source URL, then the first 300 characters of its text, then a blank line.
    """
    sample_urls = ["https://example.com", "https://httpbin.org/html"]
    for doc in await load_urls_stealth(sample_urls):
        print(f"--- {doc.metadata['title']} ({doc.metadata['source']}) ---")
        print(doc.page_content[:300])
        print()
if __name__ == "__main__":
    # Run the demo only when executed as a script, not when imported.
    asyncio.run(main())