Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
141 changes: 29 additions & 112 deletions cookbook/company-info/scrapegraph_sdk.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"id": "jEkuKbcRrPcK"
},
"source": [
"## 🕷️ Extract Company Info with Official Scrapegraph SDK\n"
"## \ud83d\udd77\ufe0f Extract Company Info with Official Scrapegraph SDK\n"
]
},
{
Expand All @@ -24,7 +24,7 @@
"id": "IzsyDXEWwPVt"
},
"source": [
"### 🔧 Install `dependencies`"
"### \ud83d\udd27 Install `dependencies`"
]
},
{
Expand All @@ -45,7 +45,7 @@
"id": "apBsL-L2KzM7"
},
"source": [
"### 🔑 Import `ScrapeGraph` API key"
"### \ud83d\udd11 Import `ScrapeGraph` API key"
]
},
{
Expand All @@ -54,7 +54,7 @@
"id": "ol9gQbAFkh9b"
},
"source": [
"You can find the Scrapegraph API key [here](https://dashboard.scrapegraphai.com/)"
"You can find the Scrapegraph API key [here](https://scrapegraphai.com/dashboard)"
]
},
{
Expand Down Expand Up @@ -83,7 +83,7 @@
"output_type": "stream",
"text": [
"SGAI_API_KEY not found in environment.\n",
"Please enter your SGAI_API_KEY: ··········\n",
"Please enter your SGAI_API_KEY: \u00b7\u00b7\u00b7\u00b7\u00b7\u00b7\u00b7\u00b7\u00b7\u00b7\n",
"SGAI_API_KEY has been set in the environment.\n"
]
}
Expand All @@ -102,7 +102,7 @@
"id": "jnqMB2-xVYQ7"
},
"source": [
"### 📝 Defining an `Output Schema` for Webpage Content Extraction\n"
"### \ud83d\udcdd Defining an `Output Schema` for Webpage Content Extraction\n"
]
},
{
Expand Down Expand Up @@ -237,7 +237,7 @@
"id": "cDGH0b2DkY63"
},
"source": [
"### 🚀 Initialize `SGAI Client` and start extraction"
"### \ud83d\ude80 Initialize `SGAI Client` and start extraction"
]
},
{
Expand All @@ -246,7 +246,7 @@
"id": "4SLJgXgcob6L"
},
"source": [
"Initialize the client for scraping (there's also an async version [here](https://github.com/ScrapeGraphAI/scrapegraph-sdk/blob/main/scrapegraph-py/examples/async_smartscraper_example.py))"
"Initialize the client for scraping (an async version using `AsyncScrapeGraphAI` is available [here](https://github.com/ScrapeGraphAI/scrapegraph-py/blob/main/examples/extract/extract_basic_async.py))."
]
},
{
Expand All @@ -257,10 +257,9 @@
},
"outputs": [],
"source": [
"from scrapegraph_py import Client\n",
"from scrapegraph_py import ScrapeGraphAI\n",
"\n",
"# Initialize the client with explicit API key\n",
"sgai_client = Client(api_key=sgai_api_key)"
"sgai_client = ScrapeGraphAI()"
]
},
{
Expand All @@ -269,13 +268,7 @@
"id": "M1KSXffZopUD"
},
"source": [
"Here we use `Smartscraper` service to extract structured data using AI from a webpage.\n",
"\n",
"\n",
"> If you already have an HTML file, you can upload it and use `Localscraper` instead.\n",
"\n",
"\n",
"\n"
"Use the `extract` method to pull structured data from a URL with AI. The same method also accepts raw `html=` or `markdown=` if you already have the page content."
]
},
{
Expand All @@ -286,11 +279,10 @@
},
"outputs": [],
"source": [
"# Request for Trending Repositories\n",
"repo_response = sgai_client.smartscraper(\n",
" website_url=\"https://scrapegraphai.com/\",\n",
" user_prompt=\"Extract info about the company\",\n",
" output_schema=CompanyInfoSchema,\n",
"repo_response = sgai_client.extract(\n",
" \"Extract info about the company\",\n",
" url=\"https://scrapegraphai.com/\",\n",
" schema=CompanyInfoSchema.model_json_schema(),\n",
")"
]
},
Expand Down Expand Up @@ -323,91 +315,16 @@
"id": "F1VfD8B4LPc8",
"outputId": "8d7b2955-1569-4b3a-8ffe-014a8442dd12"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Request ID: 87a7ea1a-9dd4-4d1d-ae76-b419ead57c11\n",
"Company Info:\n",
"{\n",
" \"company_name\": \"ScrapeGraphAI\",\n",
" \"description\": \"ScrapeGraphAI is a powerful AI scraping API designed for efficient web data extraction to power LLM applications and AI agents. It enables developers to perform intelligent AI scraping and extract structured information from websites using advanced AI techniques.\",\n",
" \"founders\": [\n",
" {\n",
" \"name\": \"\",\n",
" \"role\": \"Founder & Technical Lead\",\n",
" \"linkedin\": \"https://www.linkedin.com/in/perinim/\"\n",
" },\n",
" {\n",
" \"name\": \"Marco Vinciguerra\",\n",
" \"role\": \"Founder & Software Engineer\",\n",
" \"linkedin\": \"https://www.linkedin.com/in/marco-vinciguerra-7ba365242/\"\n",
" },\n",
" {\n",
" \"name\": \"Lorenzo Padoan\",\n",
" \"role\": \"Founder & Product Engineer\",\n",
" \"linkedin\": \"https://www.linkedin.com/in/lorenzo-padoan-4521a2154/\"\n",
" }\n",
" ],\n",
" \"logo\": \"https://scrapegraphai.com/images/scrapegraphai_logo.svg\",\n",
" \"partners\": [\n",
" \"PostHog\",\n",
" \"AWS\",\n",
" \"NVIDIA\",\n",
" \"JinaAI\",\n",
" \"DagWorks\",\n",
" \"Browserbase\",\n",
" \"ScrapeDo\",\n",
" \"HackerNews\",\n",
" \"Medium\",\n",
" \"HackADay\"\n",
" ],\n",
" \"pricing_plans\": [\n",
" {\n",
" \"tier\": \"Free\",\n",
" \"price\": \"$0\",\n",
" \"credits\": 100\n",
" },\n",
" {\n",
" \"tier\": \"Starter\",\n",
" \"price\": \"$20/month\",\n",
" \"credits\": 5000\n",
" },\n",
" {\n",
" \"tier\": \"Growth\",\n",
" \"price\": \"$100/month\",\n",
" \"credits\": 40000\n",
" },\n",
" {\n",
" \"tier\": \"Pro\",\n",
" \"price\": \"$500/month\",\n",
" \"credits\": 250000\n",
" }\n",
" ],\n",
" \"contact_emails\": [\n",
" \"contact@scrapegraphai.com\"\n",
" ],\n",
" \"social_links\": {\n",
" \"linkedin\": \"https://www.linkedin.com/company/101881123\",\n",
" \"twitter\": \"https://x.com/scrapegraphai\",\n",
" \"github\": \"https://github.com/ScrapeGraphAI/Scrapegraph-ai\"\n",
" },\n",
" \"privacy_policy\": \"https://scrapegraphai.com/privacy\",\n",
" \"terms_of_service\": \"https://scrapegraphai.com/terms\",\n",
" \"api_status\": \"https://scrapegraphapi.openstatus.dev\"\n",
"}\n"
]
}
],
"outputs": [],
"source": [
"import json\n",
"\n",
"# Print the response\n",
"request_id = repo_response['request_id']\n",
"result = repo_response['result']\n",
"if repo_response.status != \"success\":\n",
" raise RuntimeError(repo_response.error)\n",
"\n",
"result = repo_response.data.json_data\n",
"\n",
"print(f\"Request ID: {request_id}\")\n",
"print(\"Tokens used:\", repo_response.data.usage)\n",
"print(\"Company Info:\")\n",
"print(json.dumps(result, indent=2))"
]
Expand All @@ -418,7 +335,7 @@
"id": "2as65QLypwdb"
},
"source": [
"### 💾 Save the output to a `CSV` file"
"### \ud83d\udcbe Save the output to a `CSV` file"
]
},
{
Expand Down Expand Up @@ -1883,7 +1800,7 @@
"id": "-1SZT8VzTZNd"
},
"source": [
"## 🔗 Resources"
"## \ud83d\udd17 Resources"
]
},
{
Expand All @@ -1893,13 +1810,13 @@
},
"source": [
"\n",
"- 🚀 **Get your API Key:** [ScrapeGraphAI Dashboard](https://dashboard.scrapegraphai.com) \n",
"- 🐙 **GitHub:** [ScrapeGraphAI GitHub](https://github.com/scrapegraphai) \n",
"- 💼 **LinkedIn:** [ScrapeGraphAI LinkedIn](https://www.linkedin.com/company/scrapegraphai/) \n",
"- 🐦 **Twitter:** [ScrapeGraphAI Twitter](https://twitter.com/scrapegraphai) \n",
"- 💬 **Discord:** [Join our Discord Community](https://discord.gg/uJN7TYcpNa) \n",
"- \ud83d\ude80 **Get your API Key:** [ScrapeGraphAI Dashboard](https://scrapegraphai.com/dashboard) \n",
"- \ud83d\udc19 **GitHub:** [ScrapeGraphAI GitHub](https://github.com/scrapegraphai) \n",
"- \ud83d\udcbc **LinkedIn:** [ScrapeGraphAI LinkedIn](https://www.linkedin.com/company/scrapegraphai/) \n",
"- \ud83d\udc26 **Twitter:** [ScrapeGraphAI Twitter](https://twitter.com/scrapegraphai) \n",
"- \ud83d\udcac **Discord:** [Join our Discord Community](https://discord.gg/uJN7TYcpNa) \n",
"\n",
"Made with ❤️ by the [ScrapeGraphAI](https://scrapegraphai.com) Team \n"
"Made with \u2764\ufe0f by the [ScrapeGraphAI](https://scrapegraphai.com) Team \n"
]
}
],
Expand Down
Loading