Welcome to the Crawl4AI Quickstart Guide! In this tutorial, we'll take you through the basics of Crawl4AI in a friendly and lighthearted tone. We'll cover everything from basic usage to advanced features like chunking and extraction strategies, all powered by asynchronous programming. Let's dive in! 🌟
## Getting Started
First, let's import the necessary modules and create an instance of `AsyncWebCrawler`. We'll use an async context manager, which handles the setup and teardown of the crawler for us.
```python
import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    async with AsyncWebCrawler(verbose=True) as crawler:
        # We'll add our crawling code here
        pass

if __name__ == "__main__":
    asyncio.run(main())
```
## Basic Usage
Simply provide a URL and let Crawl4AI do its magic!
```python
async def main():
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(url="https://www.nbcnews.com/business")
        print(f"Basic crawl result: {result.markdown[:500]}")  # Print first 500 characters

asyncio.run(main())
```
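The object returned by `arun()` carries more than just markdown. As a quick, non-exhaustive sketch (the attribute names below are assumptions based on the commonly documented `CrawlResult` fields such as `success`, `html`, `cleaned_html`, and `links`; check your installed version if any are missing), you can inspect the other fields like this:

```python
async def main():
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(url="https://www.nbcnews.com/business")
        # Assumed CrawlResult attributes; adapt if your version differs.
        print(f"Crawl succeeded: {result.success}")
        print(f"Raw HTML length: {len(result.html)}")
        print(f"Cleaned HTML length: {len(result.cleaned_html)}")
        print(f"Internal links found: {len(result.links.get('internal', []))}")

asyncio.run(main())
```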
## Taking Screenshots
Let's take a screenshot of the page!
```python
import base64

async def main():
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(url="https://www.nbcnews.com/business", screenshot=True)
        # The screenshot comes back base64-encoded, so decode it before writing to disk.
        with open("screenshot.png", "wb") as f:
            f.write(base64.b64decode(result.screenshot))
        print("Screenshot saved to 'screenshot.png'!")

asyncio.run(main())
```
## Understanding Parameters
By default, Crawl4AI caches the results of your crawls. This means subsequent crawls of the same URL will be much faster! Let's see this in action.
```python
async def main():
    async with AsyncWebCrawler(verbose=True) as crawler:
        # First crawl (caches the result)
        result1 = await crawler.arun(url="https://www.nbcnews.com/business")
        print(f"First crawl result: {result1.markdown[:100]}...")

        # Force a fresh crawl, ignoring the cached result
        result2 = await crawler.arun(url="https://www.nbcnews.com/business", bypass_cache=True)
        print(f"Second crawl result: {result2.markdown[:100]}...")

asyncio.run(main())
```
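If you want to see the cache speedup for yourself, a minimal sketch like the one below times two identical crawls back to back. The timing code is purely illustrative; absolute numbers will depend on your network and machine.

```python
import time

async def main():
    async with AsyncWebCrawler(verbose=True) as crawler:
        start = time.perf_counter()
        await crawler.arun(url="https://www.nbcnews.com/business")
        print(f"First (uncached) crawl: {time.perf_counter() - start:.2f}s")

        start = time.perf_counter()
        await crawler.arun(url="https://www.nbcnews.com/business")
        print(f"Second (cached) crawl: {time.perf_counter() - start:.2f}s")

asyncio.run(main())
```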
## Adding a Chunking Strategy
Let's add a chunking strategy: `RegexChunking`! This strategy splits the text based on a given regex pattern.
```python
from crawl4ai.chunking_strategy import RegexChunking

async def main():
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            chunking_strategy=RegexChunking(patterns=["\n\n"])
        )
        print(f"RegexChunking result: {result.extracted_content[:200]}...")

asyncio.run(main())
```
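To get a feel for what the strategy does before wiring it into a crawl, you can try it on a plain string. This is a minimal sketch that assumes `RegexChunking` exposes the `chunk()` method the crawler uses internally; if your version names it differently, adapt accordingly.

```python
from crawl4ai.chunking_strategy import RegexChunking

text = "First paragraph.\n\nSecond paragraph.\n\nThird paragraph."
chunker = RegexChunking(patterns=["\n\n"])

# chunk() is assumed to return a list of text segments split on the given pattern.
for i, chunk in enumerate(chunker.chunk(text)):
    print(f"Chunk {i}: {chunk!r}")
```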
## Adding an Extraction Strategy
Let's get smarter with an extraction strategy: `JsonCssExtractionStrategy`! This strategy extracts structured data from the HTML using CSS selectors.
```python
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
import json

async def main():
    # Each field maps a CSS selector to a named value in the extracted JSON.
    schema = {
        "name": "News Articles",
        "baseSelector": "article.tease-card",
        "fields": [
            {
                "name": "title",
                "selector": "h2",
                "type": "text",
            },
            {
                "name": "summary",
                "selector": "div.tease-card__info",
                "type": "text",
            },
        ],
    }

    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            extraction_strategy=JsonCssExtractionStrategy(schema, verbose=True)
        )
        extracted_data = json.loads(result.extracted_content)
        print(f"Extracted {len(extracted_data)} articles")
        print(json.dumps(extracted_data[0], indent=2))

asyncio.run(main())
```
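The schema isn't limited to plain text fields. As a hedged example (the `attribute` field type shown here follows the library's schema conventions; verify it against the docs for your installed version), you could also capture each article's link:

```python
schema = {
    "name": "News Articles",
    "baseSelector": "article.tease-card",
    "fields": [
        {"name": "title", "selector": "h2", "type": "text"},
        # An "attribute" field pulls an HTML attribute instead of the element's text.
        {"name": "link", "selector": "a", "type": "attribute", "attribute": "href"},
    ],
}
```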
## Using LLMExtractionStrategy
Time to bring out the big guns: `LLMExtractionStrategy`! This strategy uses a large language model to extract relevant information from the web page.
```python
from crawl4ai.extraction_strategy import LLMExtractionStrategy
import os
from pydantic import BaseModel, Field

class OpenAIModelFee(BaseModel):
    model_name: str = Field(..., description="Name of the OpenAI model.")
    input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
    output_fee: str = Field(..., description="Fee for output token for the OpenAI model.")

async def main():
    if not os.getenv("OPENAI_API_KEY"):
        print("OpenAI API key not found. Skipping this example.")
        return
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url="https://openai.com/api/pricing/",
            word_count_threshold=1,
            extraction_strategy=LLMExtractionStrategy(
                provider="openai/gpt-4o",
                api_token=os.getenv("OPENAI_API_KEY"),
                schema=OpenAIModelFee.schema(),
                extraction_type="schema",
                instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
                Do not miss any models in the entire content. One extracted model JSON format should look like this:
                {"model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens"}.""",
            ),
            bypass_cache=True,
        )
        print(result.extracted_content)

asyncio.run(main())
```
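Since the instruction asks the model to emit objects matching `OpenAIModelFee`, you can validate the output with the same Pydantic class. A small sketch, assuming `extracted_content` comes back as a JSON array of such objects (malformed items would raise a validation error you may want to catch):

```python
import json

def parse_fees(extracted_content: str) -> list:
    # Assumes the LLM returned a JSON array of objects matching the OpenAIModelFee schema.
    return [OpenAIModelFee(**item) for item in json.loads(extracted_content)]

# Inside main(), right after the crawl:
#     for fee in parse_fees(result.extracted_content):
#         print(f"{fee.model_name}: {fee.input_fee} in / {fee.output_fee} out")
```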
## Interactive Extraction 🖱️
Let's use JavaScript to interact with the page before extraction!
```python
async def main():
    # Click the "Load More" button if it exists on the page.
    js_code = """
    const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
    loadMoreButton && loadMoreButton.click();
    """
    # Wait until more than 10 article cards are present before extracting.
    wait_for = """() => {
        return Array.from(document.querySelectorAll('article.tease-card')).length > 10;
    }"""

    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            js_code=js_code,
            wait_for=wait_for,
            css_selector="article.tease-card",
            bypass_cache=True,
        )
        print(f"JavaScript interaction result: {result.extracted_content[:500]}")

asyncio.run(main())
```
## Advanced Session-Based Crawling for Dynamic Content
In modern web applications, content is often loaded dynamically without the URL changing. This is common in single-page applications (SPAs) and sites that use infinite scrolling. Traditional crawling methods that rely on URL changes don't work here, and that's where Crawl4AI's advanced session-based crawling comes in handy!
Here's what makes this approach powerful:
- Session preservation: By using a `session_id`, we maintain the state of the crawling session across multiple interactions with the page. This is crucial for navigating through dynamically loaded content.
- Asynchronous JavaScript execution: We can execute custom JavaScript to trigger content loading or navigation. In this example, we'll click the pagination's "Next" button to load the next page of commits.
- Dynamic content waiting: The `wait_for` parameter lets us specify a condition that must be met before the page is considered loaded. This ensures we don't extract data before the new content has fully loaded.
Let's see how this works with a real-world example: crawling multiple pages of commits in a GitHub repository. The URL doesn't change as more commits load, so we'll use these advanced techniques to navigate and extract the data.
```python
import asyncio
import json
from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

async def main():
    async with AsyncWebCrawler(verbose=True) as crawler:
        url = "https://github.com/microsoft/TypeScript/commits/main"
        session_id = "typescript_commits_session"
        all_commits = []

        # Click the "Next" pagination button to load the next page of commits.
        js_next_page = """
        const button = document.querySelector('a[data-testid="pagination-next-button"]');
        if (button) button.click();
        """

        # Consider the page loaded once the first visible commit differs from the last one we saw.
        wait_for = """() => {
            const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4');
            if (commits.length === 0) return false;
            const firstCommit = commits[0].textContent.trim();
            return firstCommit !== window.lastCommit;
        }"""

        schema = {
            "name": "Commit Extractor",
            "baseSelector": "li.Box-sc-g0xbh4-0",
            "fields": [
                {
                    "name": "title",
                    "selector": "h4.markdown-title",
                    "type": "text",
                    "transform": "strip",
                },
            ],
        }
        extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)

        for page in range(3):  # Crawl 3 pages
            result = await crawler.arun(
                url=url,
                session_id=session_id,
                css_selector="li.Box-sc-g0xbh4-0",
                extraction_strategy=extraction_strategy,
                js_code=js_next_page if page > 0 else None,
                wait_for=wait_for if page > 0 else None,
                js_only=page > 0,
                bypass_cache=True,
                headless=False,
            )
            assert result.success, f"Failed to crawl page {page + 1}"
            commits = json.loads(result.extracted_content)
            all_commits.extend(commits)
            print(f"Page {page + 1}: Found {len(commits)} commits")

        # Clean up the browser session once we're done.
        await crawler.crawler_strategy.kill_session(session_id)
        print(f"Successfully crawled {len(all_commits)} commits across 3 pages")

asyncio.run(main())
```
In this example, we crawl multiple pages of commits from a GitHub repository. The URL doesn't change as more commits load, so we use JavaScript to click the "Next" pagination button and a `wait_for` condition to make sure the new content has loaded before extraction. This powerful combination lets us navigate and extract data from complex, dynamically loading web applications with ease!
## Congratulations! 🎉
You've made it through the Crawl4AI Quickstart Guide! Now go forth and crawl the web asynchronously like a pro! 🕸️