Skip to content

Commit 0ae6742

Browse files
committed
Add Geolocation and User Agent type params to universal scraper, remove parse parameter for universal scraper, update tests
1 parent d92d845 commit 0ae6742

File tree

6 files changed

+357
-373
lines changed

6 files changed

+357
-373
lines changed

CHANGELOG.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
# Changelog
22

3-
## [0.2.0] - 2025-05-12
3+
## [0.2.0] - 2025-05-13
44

55
### Added
66

77
- Changelog
88
- E2E tests
9+
- Geolocation and User Agent type parameters to universal scraper
910

1011
### Changed
1112

@@ -16,3 +17,4 @@
1617
### Removed
1718

1819
- WebUnblocker tool
20+
- Parse parameter for universal scraper

src/oxylabs_mcp/server.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,9 @@
1616
async def universal_scraper(
1717
ctx: Context, # type: ignore[type-arg]
1818
url: url_params.URL_PARAM,
19-
parse: url_params.PARSE_PARAM = False, # noqa: FBT002
2019
render: url_params.RENDER_PARAM = "",
20+
user_agent_type: url_params.USER_AGENT_TYPE_PARAM = "",
21+
geo_location: url_params.GEO_LOCATION_PARAM = "",
2122
output_format: url_params.OUTPUT_FORMAT_PARAM = "",
2223
) -> str:
2324
"""Get a content of any webpage.
@@ -28,16 +29,19 @@ async def universal_scraper(
2829
try:
2930
async with oxylabs_client(ctx) as client:
3031
payload: dict[str, Any] = {"url": url}
31-
if parse:
32-
payload["parse"] = parse
32+
3333
if render:
3434
payload["render"] = render
35+
if user_agent_type:
36+
payload["user_agent_type"] = user_agent_type
37+
if geo_location:
38+
payload["geo_location"] = geo_location
3539

3640
response = await client.post(settings.OXYLABS_SCRAPER_URL, json=payload)
3741

3842
response.raise_for_status()
3943

40-
return get_content(response, parse=parse, output_format=output_format)
44+
return get_content(response, output_format=output_format)
4145
except MCPServerError as e:
4246
return e.stringify()
4347

src/oxylabs_mcp/utils.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -208,18 +208,18 @@ def extract_links_with_text(html: str, base_url: str | None = None) -> list[str]
208208
return links
209209

210210

211-
def get_content(response: Response, *, parse: bool, output_format: str) -> str:
211+
def get_content(response: Response, *, output_format: str, parse: bool = False) -> str:
212212
"""Extract content from response and convert to a proper format."""
213213
content = response.json()["results"][0]["content"]
214214
if parse and isinstance(content, dict):
215215
return json.dumps(content)
216216
if output_format == "html":
217217
return str(content)
218-
if output_format == "md":
219-
striped_html = strip_html(str(content))
220-
return markdownify(striped_html) # type: ignore[no-any-return]
221218
if output_format == "links":
222219
links = extract_links_with_text(str(content))
223220
return "\n".join(links)
221+
if output_format in ("md", ""):
222+
striped_html = strip_html(str(content))
223+
return markdownify(striped_html) # type: ignore[no-any-return]
224224

225225
return str(content)

tests/e2e/test_llm_agent.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,15 @@ async def oxylabs_mcp_server():
8686
},
8787
"iPhone 16",
8888
),
89+
(
90+
"Search for iPhone 16 in google with browser rendering",
91+
"google_search_scraper",
92+
{
93+
"query": "iPhone 16",
94+
"render": "html",
95+
},
96+
"iPhone 16",
97+
),
8998
(
9099
"Search for iPhone 16 in google with user agent type mobile",
91100
"google_search_scraper",

tests/integration/params.py

Lines changed: 84 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -8,21 +8,28 @@
88
{"query": "Generic query"},
99
does_not_raise(),
1010
{"results": [{"content": "Mocked content"}]},
11-
"Mocked content",
11+
"\n\nMocked content\n\n",
1212
id="query-only-args",
1313
)
1414
PARSE_ENABLED = pytest.param(
1515
{"query": "Generic query", "parse": True},
1616
does_not_raise(),
17-
{"results": [{"content": '{"data": "value"}'}]},
17+
{"results": [{"content": {"data": "value"}}]},
1818
'{"data": "value"}',
1919
id="parse-enabled-args",
2020
)
21-
RENDER_HTML = pytest.param(
21+
RENDER_HTML_WITH_QUERY = pytest.param(
2222
{"query": "Generic query", "render": "html"},
2323
does_not_raise(),
2424
{"results": [{"content": "Mocked content"}]},
25-
"Mocked content",
25+
"\n\nMocked content\n\n",
26+
id="render-enabled-args",
27+
)
28+
RENDER_INVALID_WITH_QUERY = pytest.param(
29+
{"query": "Generic query", "render": "png"},
30+
pytest.raises(ToolError),
31+
{},
32+
None,
2633
id="render-enabled-args",
2734
)
2835
OUTPUT_FORMATS = [
@@ -54,12 +61,33 @@
5461
id="html-output-format-args",
5562
),
5663
]
57-
USER_AGENTS = [
64+
USER_AGENTS_WITH_QUERY = [
5865
pytest.param(
59-
{"query": "Generic query", "user_agent_type": "mobile"},
66+
{"query": "Generic query", "user_agent_type": uat},
6067
does_not_raise(),
6168
{"results": [{"content": "Mocked content"}]},
62-
"Mocked content",
69+
"\n\nMocked content\n\n",
70+
id=f"{uat}-user-agent-specified-args",
71+
)
72+
for uat in [
73+
"desktop",
74+
"desktop_chrome",
75+
"desktop_firefox",
76+
"desktop_safari",
77+
"desktop_edge",
78+
"desktop_opera",
79+
"mobile",
80+
"mobile_ios",
81+
"mobile_android",
82+
"tablet",
83+
]
84+
]
85+
USER_AGENTS_WITH_URL = [
86+
pytest.param(
87+
{"url": "https://example.com", "user_agent_type": uat},
88+
does_not_raise(),
89+
{"results": [{"content": "Mocked content"}]},
90+
"\n\nMocked content\n\n",
6391
id=f"{uat}-user-agent-specified-args",
6492
)
6593
for uat in [
@@ -85,91 +113,126 @@
85113
START_PAGE_SPECIFIED = pytest.param(
86114
{"query": "Generic query", "start_page": 2},
87115
does_not_raise(),
88-
{"results": [{"content": '{"data": "value"}'}]},
116+
{"results": [{"content": {"data": "value"}}]},
89117
'{"data": "value"}',
90118
id="start-page-specified-args",
91119
)
92120
START_PAGE_INVALID = pytest.param(
93121
{"query": "Generic query", "start_page": -1},
94122
pytest.raises(ToolError),
95-
{"results": [{"content": '{"data": "value"}'}]},
123+
{"results": [{"content": {"data": "value"}}]},
96124
'{"data": "value"}',
97125
id="start-page-invalid-args",
98126
)
99127
PAGES_SPECIFIED = pytest.param(
100128
{"query": "Generic query", "pages": 20},
101129
does_not_raise(),
102-
{"results": [{"content": '{"data": "value"}'}]},
130+
{"results": [{"content": {"data": "value"}}]},
103131
'{"data": "value"}',
104132
id="pages-specified-args",
105133
)
106134
PAGES_INVALID = pytest.param(
107135
{"query": "Generic query", "pages": -10},
108136
pytest.raises(ToolError),
109-
{"results": [{"content": '{"data": "value"}'}]},
137+
{"results": [{"content": {"data": "value"}}]},
110138
'{"data": "value"}',
111139
id="pages-invalid-args",
112140
)
113141
LIMIT_SPECIFIED = pytest.param(
114142
{"query": "Generic query", "limit": 100},
115143
does_not_raise(),
116-
{"results": [{"content": '{"data": "value"}'}]},
144+
{"results": [{"content": {"data": "value"}}]},
117145
'{"data": "value"}',
118146
id="limit-specified-args",
119147
)
120148
LIMIT_INVALID = pytest.param(
121149
{"query": "Generic query", "limit": 0},
122150
pytest.raises(ToolError),
123-
{"results": [{"content": '{"data": "value"}'}]},
151+
{"results": [{"content": {"data": "value"}}]},
124152
'{"data": "value"}',
125153
id="limit-invalid-args",
126154
)
127155
DOMAIN_SPECIFIED = pytest.param(
128156
{"query": "Generic query", "domain": "io"},
129157
does_not_raise(),
130-
{"results": [{"content": '{"data": "value"}'}]},
158+
{"results": [{"content": {"data": "value"}}]},
131159
'{"data": "value"}',
132160
id="domain-specified-args",
133161
)
134-
GEO_LOCATION_SPECIFIED = pytest.param(
162+
GEO_LOCATION_SPECIFIED_WITH_QUERY = pytest.param(
135163
{"query": "Generic query", "geo_location": "Miami, Florida"},
136164
does_not_raise(),
137-
{"results": [{"content": '{"data": "value"}'}]},
165+
{"results": [{"content": {"data": "value"}}]},
138166
'{"data": "value"}',
139167
id="geo-location-specified-args",
140168
)
169+
GEO_LOCATION_SPECIFIED_WITH_URL = pytest.param(
170+
{"url": "https://example.com", "geo_location": "Miami, Florida"},
171+
does_not_raise(),
172+
{"results": [{"content": "Mocked content"}]},
173+
"\n\nMocked content\n\n",
174+
id="geo-location-specified-args",
175+
)
141176
LOCALE_SPECIFIED = pytest.param(
142177
{"query": "Generic query", "locale": "ja_JP"},
143178
does_not_raise(),
144-
{"results": [{"content": '{"data": "value"}'}]},
179+
{"results": [{"content": {"data": "value"}}]},
145180
'{"data": "value"}',
146181
id="locale-specified-args",
147182
)
148183
CATEGORY_SPECIFIED = pytest.param(
149184
{"query": "Man's T-shirt", "category_id": "QE21R9AV"},
150185
does_not_raise(),
151-
{"results": [{"content": '{"data": "value"}'}]},
186+
{"results": [{"content": {"data": "value"}}]},
152187
'{"data": "value"}',
153188
id="category-id-specified-args",
154189
)
155190
MERCHANT_ID_SPECIFIED = pytest.param(
156191
{"query": "Man's T-shirt", "merchant_id": "QE21R9AV"},
157192
does_not_raise(),
158-
{"results": [{"content": '{"data": "value"}'}]},
193+
{"results": [{"content": {"data": "value"}}]},
159194
'{"data": "value"}',
160195
id="merchant-id-specified-args",
161196
)
162197
CURRENCY_SPECIFIED = pytest.param(
163198
{"query": "Man's T-shirt", "currency": "USD"},
164199
does_not_raise(),
165-
{"results": [{"content": '{"data": "value"}'}]},
200+
{"results": [{"content": {"data": "value"}}]},
166201
'{"data": "value"}',
167202
id="currency-specified-args",
168203
)
169204
AUTOSELECT_VARIANT_ENABLED = pytest.param(
170205
{"query": "B0BVF87BST", "autoselect_variant": True},
171206
does_not_raise(),
172-
{"results": [{"content": '{"data": "value"}'}]},
207+
{"results": [{"content": {"data": "value"}}]},
173208
'{"data": "value"}',
174209
id="autoselect-variant-enabled-args",
175210
)
211+
URL_ONLY = pytest.param(
212+
{"url": "https://example.com"},
213+
does_not_raise(),
214+
{"results": [{"content": "Mocked content"}]},
215+
"\n\nMocked content\n\n",
216+
id="url-only-args",
217+
)
218+
NO_URL = pytest.param(
219+
{},
220+
pytest.raises(ToolError),
221+
{"results": [{"content": "Mocked content"}]},
222+
"\n\nMocked content\n\n",
223+
id="no-url-args",
224+
)
225+
RENDER_HTML_WITH_URL = pytest.param(
226+
{"url": "https://example.com", "render": "html"},
227+
does_not_raise(),
228+
{"results": [{"content": "Mocked content"}]},
229+
"\n\nMocked content\n\n",
230+
id="render-enabled-args",
231+
)
232+
RENDER_INVALID_WITH_URL = pytest.param(
233+
{"url": "https://example.com", "render": "png"},
234+
pytest.raises(ToolError),
235+
{},
236+
None,
237+
id="render-enabled-args",
238+
)

0 commit comments

Comments
 (0)