项目 github 地址,docker 搭建很简单
docker run -p 6000:3000 ghcr.io/browserless/chrome
对应的 docs 位于:
http://ip:6000/docs
然后在 Browser REST APIs 分类下可以找到对应的 API,如:http://ip:6000/chrome/content。
python 简单的请求实现:
import requests

# Target page to render; replace with the page you actually want to fetch
base_url = 'https://example.com'
# Browserless "content" endpoint: returns the fully rendered HTML of a page
url = 'http://ip:6000/chrome/content'
# Payload: the URL that browserless should navigate to
data = {"url": base_url}
# Send the POST request (json= sets the Content-Type header automatically)
response = requests.post(url, json=data)
# Check the response status code
if response.status_code == 200:
    # Request succeeded: print the rendered page HTML
    print(response.text)
else:
    # Request failed: report the HTTP status code
    print(f'Error: {response.status_code}')
封装为函数:
def scrape(url: str):
    """Fetch the rendered HTML of *url* via the browserless /chrome/content
    endpoint and return its visible text.

    If the extracted text exceeds 8000 characters it is condensed with
    ``summary()`` (defined elsewhere in the project) so it stays small
    enough for downstream use.

    Args:
        url: URL of the website to be scraped.

    Returns:
        The page text, the output of ``summary(text)`` for overly long
        pages, or ``None`` when the HTTP request fails.
    """
    print("Scraping website...")
    # Define the headers for the request
    headers = {
        'Cache-Control': 'no-cache',
        'Content-Type': 'application/json',
    }
    # Define the data to be sent in the request
    data = {"url": url}
    # Convert Python object to JSON string
    data_json = json.dumps(data)
    # Send the POST request to the browserless content endpoint
    # NOTE(review): "ip" is a placeholder host — substitute the real address
    post_url = "http://ip:6000/chrome/content"
    response = requests.post(post_url, headers=headers, data=data_json)
    # Check the response status code
    if response.status_code == 200:
        # Strip markup and keep only the visible text of the page
        soup = BeautifulSoup(response.content, "html.parser")
        text = soup.get_text()
        print("CONTENT:", text)
        # Summarize overly long pages instead of returning them verbatim
        if len(text) > 8000:
            return summary(text)
        return text
    else:
        print(f"HTTP request failed with status code {response.status_code}")
        # Explicit failure marker (was an implicit None fall-through)
        return None
正文完
发表至: NLP
2024-02-22