Skip to content

Add changelog, add e2e tests, update tools, remove WebUnblocker #16

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into the base branch from the feature branch on
May 14, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/lint_and_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,4 +37,4 @@ jobs:

- name: Run tests
run: |
uv run pytest --cov=src --cov-report xml --cov-report term --cov-fail-under=90 ./tests
uv run pytest --cov=src --cov-report xml --cov-report term --cov-fail-under=90 tests/unit tests/integration
4 changes: 2 additions & 2 deletions .github/workflows/publish_to_pypi.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ name: Publish Python 🐍 distributions 📦 to PyPI

on:
push:
branches: [ "main" ]

tags:
- 'v[0-9]+.[0-9]+.[0-9]+'
jobs:
build-n-publish:
name: Build and publish Python distribution to PyPI
Expand Down
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ ipython_config.py
__pypackages__/

# Environments
.env
*/.env
.venv
env/
venv/
Expand Down
20 changes: 20 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Changelog

## [0.2.0] - 2025-05-13

### Added

- Changelog
- E2E tests
- Geolocation and User Agent type parameters to universal scraper

### Changed

- Descriptions for tools
- Descriptions for tool parameters
- Default values for tool parameters

### Removed

- WebUnblocker tool
- Parse parameter for universal scraper
11 changes: 8 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ virtualenv_dir ?= .venv

.PHONY: install_deps
install_deps: $(virtualenv_dir)
uv sync
uv sync --group dev

.PHONY: lint
lint: install_deps
Expand All @@ -22,11 +22,16 @@ format: $(virtualenv_dir)

.PHONY: test
test: install_deps
uv run pytest --cov=src --cov-report xml --cov-report term --cov-fail-under=90 ./tests
uv run pytest --cov=src --cov-report xml --cov-report term --cov-fail-under=90 tests/unit tests/integration

.PHONY: test-e2e
test-e2e:
uv sync --group dev --group e2e-tests
uv run pytest --cov=src --cov-report xml --cov-report term tests/e2e

.PHONY: run
run: install_deps
npx @modelcontextprotocol/inspector@0.3.0 \
npx @modelcontextprotocol/inspector \
uv \
--directory $(current_dir) \
run \
Expand Down
10 changes: 0 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -239,16 +239,6 @@ make run
```
Then access MCP Inspector at `http://localhost:5173`. You may need to add your username and password as environment variables in the inspector under `OXYLABS_USERNAME` and `OXYLABS_PASSWORD`.


## 🛠️ Technical Details

This server provides two main tools:

1. **oxylabs_scraper**: Uses Oxylabs Web Scraper API for general website scraping
2. **oxylabs_web_unblocker**: Uses Oxylabs Web Unblocker for hard-to-access websites

[Web Scraper API](https://oxylabs.io/products/scraper-api/web) supports JavaScript rendering, parsed structured data, and cleaned HTML in Markdown format. [Web Unblocker](https://oxylabs.io/products/web-unblocker) offers JavaScript rendering and cleaned HTML, but doesn’t return parsed data.

---

## License
Expand Down
14 changes: 11 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "oxylabs-mcp"
version = "0.1.7"
version = "0.2.0"
description = "Oxylabs MCP server"
authors = [
{name="Augis Braziunas", email="[email protected]"},
Expand All @@ -24,7 +24,7 @@ dependencies = [
"lxml>=5.3.0",
"lxml-html-clean>=0.4.1",
"markdownify>=0.14.1",
"mcp[cli]>=1.6.0",
"mcp[cli]>=1.8.0",
"pydantic>=2.10.5",
"pydantic-settings>=2.8.1",
]
Expand All @@ -40,6 +40,12 @@ dev = [
"pytest-mock>=3.14.0",
"ruff>=0.9.1",
]
e2e-tests = [
"agno>=1.4.5",
"anthropic>=0.50.0",
"google-genai>=1.13.0",
"openai>=1.77.0",
]

[build-system]
requires = ["hatchling"]
Expand Down Expand Up @@ -89,7 +95,8 @@ lint.ignore = [
]

[tool.ruff.lint.per-file-ignores]
"tests/*" = ["D", "S101", "ARG001", "ANN", "PT011", "FBT"]
"tests/*" = ["D", "S101", "ARG001", "ANN", "PT011", "FBT", "PLR2004"]
"src/oxylabs_mcp/url_params.py" = ["E501"]

[tool.ruff.lint.pycodestyle]
max-line-length = 100
Expand All @@ -100,6 +107,7 @@ lines-after-imports = 2

[tool.pytest.ini_options]
asyncio_default_fixture_loop_scope = "session"
asyncio_mode = "auto"

[tool.black]
line-length = 100
124 changes: 53 additions & 71 deletions src/oxylabs_mcp/server.py
Original file line number Diff line number Diff line change
@@ -1,83 +1,53 @@
from typing import Any

from mcp.server.fastmcp import Context, FastMCP
from mcp.types import ToolAnnotations

from oxylabs_mcp import url_params
from oxylabs_mcp.config import settings
from oxylabs_mcp.exceptions import MCPServerError
from oxylabs_mcp.utils import (
convert_html_to_md,
get_content,
oxylabs_client,
strip_html,
)
from oxylabs_mcp.utils import get_content, oxylabs_client


mcp = FastMCP("oxylabs_mcp", dependencies=["mcp", "httpx"])
mcp = FastMCP("oxylabs_mcp")


@mcp.tool(
name="oxylabs_universal_scraper",
description="Scrape url using Oxylabs Web API with universal scraper",
)
async def scrape_universal_url(
@mcp.tool(annotations=ToolAnnotations(readOnlyHint=True))
async def universal_scraper(
ctx: Context, # type: ignore[type-arg]
url: url_params.URL_PARAM,
parse: url_params.PARSE_PARAM = False, # noqa: FBT002
render: url_params.RENDER_PARAM = "",
user_agent_type: url_params.USER_AGENT_TYPE_PARAM = "",
geo_location: url_params.GEO_LOCATION_PARAM = "",
output_format: url_params.OUTPUT_FORMAT_PARAM = "",
) -> str:
"""Scrape url using Oxylabs Web API with universal scraper."""
"""Get a content of any webpage.

Supports browser rendering, parsing of certain webpages
and different output formats.
"""
try:
async with oxylabs_client(ctx, with_auth=True) as client:
async with oxylabs_client(ctx) as client:
payload: dict[str, Any] = {"url": url}
if parse:
payload["parse"] = parse

if render:
payload["render"] = render
if user_agent_type:
payload["user_agent_type"] = user_agent_type
if geo_location:
payload["geo_location"] = geo_location

response = await client.post(settings.OXYLABS_SCRAPER_URL, json=payload)

response.raise_for_status()

return get_content(response, parse)
except MCPServerError as e:
return e.stringify()


@mcp.tool(
name="oxylabs_web_unblocker",
description="Scrape url using Oxylabs Web Unblocker",
)
async def scrape_with_web_unblocker(
ctx: Context, # type: ignore[type-arg]
url: url_params.URL_PARAM,
render: url_params.RENDER_PARAM = "",
) -> str:
"""Scrape url using Oxylabs Web Unblocker.

This tool manages the unblocking process to extract public data
even from the most difficult websites.
"""
headers: dict[str, Any] = {}
if render:
headers["X-Oxylabs-Render"] = render

try:
async with oxylabs_client(ctx, with_proxy=True, verify=False, headers=headers) as client:
response = await client.get(url)

response.raise_for_status()

return convert_html_to_md(strip_html(response.text))
return get_content(response, output_format=output_format)
except MCPServerError as e:
return e.stringify()


@mcp.tool(
name="oxylabs_google_search_scraper",
description="Scrape Google Search results using Oxylabs Web API",
)
async def scrape_google_search(
@mcp.tool(annotations=ToolAnnotations(readOnlyHint=True))
async def google_search_scraper(
ctx: Context, # type: ignore[type-arg]
query: url_params.GOOGLE_QUERY_PARAM,
parse: url_params.PARSE_PARAM = True, # noqa: FBT002
Expand All @@ -90,10 +60,15 @@ async def scrape_google_search(
geo_location: url_params.GEO_LOCATION_PARAM = "",
locale: url_params.LOCALE_PARAM = "",
ad_mode: url_params.AD_MODE_PARAM = False, # noqa: FBT002
output_format: url_params.OUTPUT_FORMAT_PARAM = "",
) -> str:
"""Scrape Google Search results using Oxylabs Web API."""
"""Scrape Google Search results.

Supports content parsing, different user agent types, pagination,
domain, geolocation, locale parameters and different output formats.
"""
try:
async with oxylabs_client(ctx, with_auth=True) as client:
async with oxylabs_client(ctx) as client:
payload: dict[str, Any] = {"query": query}

if ad_mode:
Expand Down Expand Up @@ -124,16 +99,13 @@ async def scrape_google_search(

response.raise_for_status()

return get_content(response, parse)
return get_content(response, parse=parse, output_format=output_format)
except MCPServerError as e:
return e.stringify()


@mcp.tool(
name="oxylabs_amazon_search_scraper",
description="Scrape Amazon Search results using Oxylabs Web API",
)
async def scrape_amazon_search(
@mcp.tool(annotations=ToolAnnotations(readOnlyHint=True))
async def amazon_search_scraper(
ctx: Context, # type: ignore[type-arg]
query: url_params.AMAZON_SEARCH_QUERY_PARAM,
category_id: url_params.CATEGORY_ID_CONTEXT_PARAM = "",
Expand All @@ -147,10 +119,16 @@ async def scrape_amazon_search(
domain: url_params.DOMAIN_PARAM = "",
geo_location: url_params.GEO_LOCATION_PARAM = "",
locale: url_params.LOCALE_PARAM = "",
output_format: url_params.OUTPUT_FORMAT_PARAM = "",
) -> str:
"""Scrape Amazon Search results using Oxylabs Web API."""
"""Scrape Amazon search results.

Supports content parsing, different user agent types, pagination,
domain, geolocation, locale parameters and different output formats.
Supports Amazon specific parameters such as category id, merchant id, currency.
"""
try:
async with oxylabs_client(ctx, with_auth=True) as client:
async with oxylabs_client(ctx) as client:
payload: dict[str, Any] = {"source": "amazon_search", "query": query}

context = []
Expand Down Expand Up @@ -184,16 +162,13 @@ async def scrape_amazon_search(

response.raise_for_status()

return get_content(response, parse)
return get_content(response, parse=parse, output_format=output_format)
except MCPServerError as e:
return e.stringify()


@mcp.tool(
name="oxylabs_amazon_product_scraper",
description="Scrape Amazon Products using Oxylabs Web API",
)
async def scrape_amazon_products(
@mcp.tool(annotations=ToolAnnotations(readOnlyHint=True))
async def amazon_product_scraper(
ctx: Context, # type: ignore[type-arg]
query: url_params.AMAZON_SEARCH_QUERY_PARAM,
autoselect_variant: url_params.AUTOSELECT_VARIANT_CONTEXT_PARAM = False, # noqa: FBT002
Expand All @@ -204,10 +179,17 @@ async def scrape_amazon_products(
domain: url_params.DOMAIN_PARAM = "",
geo_location: url_params.GEO_LOCATION_PARAM = "",
locale: url_params.LOCALE_PARAM = "",
output_format: url_params.OUTPUT_FORMAT_PARAM = "",
) -> str:
"""Scrape Amazon Products using Oxylabs Web API."""
"""Scrape Amazon products.

Supports content parsing, different user agent types, domain,
geolocation, locale parameters and different output formats.
Supports Amazon specific parameters such as currency and getting
more accurate pricing data with auto select variant.
"""
try:
async with oxylabs_client(ctx, with_auth=True) as client:
async with oxylabs_client(ctx) as client:
payload: dict[str, Any] = {"source": "amazon_product", "query": query}

context = []
Expand Down Expand Up @@ -235,7 +217,7 @@ async def scrape_amazon_products(

response.raise_for_status()

return get_content(response, parse)
return get_content(response, parse=parse, output_format=output_format)
except MCPServerError as e:
return e.stringify()

Expand Down
Loading