
Commit 61078ca: v0.3.5 (#89)

2 parents: 165dc52 + 82b0962

24 files changed, +259 / -283 lines

.github/workflows/tests.yml

Lines changed: 1 addition & 1 deletion
@@ -58,7 +58,7 @@ jobs:
       - name: Install all browsers dependencies
         run: |
           python3 -m pip install --upgrade pip
-          python3 -m pip install playwright==1.52.0 rebrowser-playwright==1.52.0 camoufox
+          python3 -m pip install "playwright>=1.55.0" "patchright>=1.55.0" camoufox  # quoted so bash does not parse ">" as a redirect
 
       - name: Retrieve Playwright browsers from cache if any
         id: playwright-cache

README.md

Lines changed: 2 additions & 3 deletions
@@ -322,10 +322,9 @@ This project includes code adapted from:
 ## Thanks and References
 
 - [Daijro](https://github.com/daijro)'s brilliant work on [BrowserForge](https://github.com/daijro/browserforge) and [Camoufox](https://github.com/daijro/camoufox)
-- [Vinyzu](https://github.com/Vinyzu)'s work on [Botright](https://github.com/Vinyzu/Botright)
+- [Vinyzu](https://github.com/Vinyzu)'s brilliant work on [Botright](https://github.com/Vinyzu/Botright) and [PatchRight](https://github.com/Kaliiiiiiiiii-Vinyzu/patchright)
 - [brotector](https://github.com/kaliiiiiiiiii/brotector) for browser detection bypass techniques
-- [fakebrowser](https://github.com/kkoooqq/fakebrowser) for fingerprinting research
-- [rebrowser-patches](https://github.com/rebrowser/rebrowser-patches) for stealth improvements
+- [fakebrowser](https://github.com/kkoooqq/fakebrowser) and [BotBrowser](https://github.com/botswin/BotBrowser) for fingerprinting research
 
 ---
 <div align="center"><small>Designed & crafted with ❤️ by Karim Shoair.</small></div><br>

docs/fetching/dynamic.md

Lines changed: 1 addition & 1 deletion
@@ -35,7 +35,7 @@ It's the same as the vanilla Playwright option, but it provides a simple stealth
 
 Some of the things this fetcher's stealth mode does include:
 
-* Patching the CDP runtime fingerprint.
+* Patching the CDP runtime fingerprint by using PatchRight.
 * Mimics some of the real browsers' properties by injecting several JS files and using custom options.
 * Custom flags are used on launch to hide Playwright even more and make it faster.
 * Generates real browser headers of the same type and user OS, then appends them to the request's headers.

pyproject.toml

Lines changed: 4 additions & 4 deletions
@@ -64,16 +64,16 @@ dependencies = [
 
 [project.optional-dependencies]
 fetchers = [
-    "click>=8.2.1",
+    "click>=8.3.0",
     "curl_cffi>=0.13.0",
-    "playwright>=1.52.0",
-    "rebrowser-playwright>=1.52.0",
+    "playwright>=1.55.0",
+    "patchright>=1.55.2",
     "camoufox>=0.4.11",
     "geoip2>=5.1.0",
     "msgspec>=0.19.0",
 ]
 ai = [
-    "mcp>=1.14.0",
+    "mcp>=1.14.1",
     "markdownify>=1.2.0",
     "scrapling[fetchers]",
 ]
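
Note on the dependency swap: patchright is published as a drop-in replacement for playwright, mirroring its package layout, so downstream code should only need to change the import line. A hedged sketch of what that migration looks like (the target URL is illustrative, and this snippet is not taken from the repo):

```python
# Hypothetical migration example: patchright mirrors the Playwright API,
# so only the import changes. Chromium is the engine patchright patches.
from patchright.sync_api import sync_playwright  # previously: from playwright.sync_api import sync_playwright

with sync_playwright() as pw:
    browser = pw.chromium.launch(headless=True)
    page = browser.new_page()
    page.goto("https://example.com")  # illustrative URL
    print(page.title())
    browser.close()
```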

scrapling/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 __author__ = "Karim Shoair ([email protected])"
-__version__ = "0.3.4"
+__version__ = "0.3.5"
 __copyright__ = "Copyright (c) 2024 Karim Shoair"
 

scrapling/cli.py

Lines changed: 4 additions & 4 deletions
@@ -32,8 +32,8 @@ def __ParseJSONData(json_string: Optional[str] = None) -> Optional[Dict[str, Any
     try:
         return json_loads(json_string)
-    except JSONDecodeError as e:  # pragma: no cover
-        raise ValueError(f"Invalid JSON data '{json_string}': {e}")
+    except JSONDecodeError as err:  # pragma: no cover
+        raise ValueError(f"Invalid JSON data '{json_string}': {err}")
 
 
 def __Request_and_Save(
@@ -65,8 +65,8 @@ def __ParseExtractArguments(
     for key, value in _CookieParser(cookies):
         try:
             parsed_cookies[key] = value
-        except Exception as e:
-            raise ValueError(f"Could not parse cookies '{cookies}': {e}")
+        except Exception as err:
+            raise ValueError(f"Could not parse cookies '{cookies}': {err}")
 
     parsed_json = __ParseJSONData(json)
     parsed_params = {}

scrapling/core/custom_types.py

Lines changed: 2 additions & 2 deletions
@@ -145,7 +145,7 @@ def re(
         clean_match: bool = False,
         case_sensitive: bool = True,
         check_match: Literal[False] = False,
-    ) -> "TextHandlers[TextHandler]": ...
+    ) -> "TextHandlers": ...
 
     def re(
         self,
@@ -241,7 +241,7 @@ def re(
         replace_entities: bool = True,
         clean_match: bool = False,
         case_sensitive: bool = True,
-    ) -> "TextHandlers[TextHandler]":
+    ) -> "TextHandlers":
         """Call the ``.re()`` method for each element in this list and return
         their results flattened as TextHandlers.
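
Note on this change: `TextHandlers` already fixes its element type, so the old `"TextHandlers[TextHandler]"` annotation re-subscripts a non-generic class, and checkers such as mypy reject it with an error along the lines of '"TextHandlers" expects no type arguments'. A minimal sketch, with simplified stand-in class bodies rather than the library's real definitions:

```python
from typing import List


class TextHandler(str):
    """Simplified stand-in for scrapling's string wrapper."""


class TextHandlers(List[TextHandler]):
    """Already parameterized with TextHandler, so the bare name is the full type."""

    def first(self) -> TextHandler:
        return self[0]


# -> "TextHandlers" is the correct annotation; -> "TextHandlers[TextHandler]"
# would subscript a concrete (non-generic) class, which type checkers flag.
```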

scrapling/core/shell.py

Lines changed: 4 additions & 4 deletions
@@ -201,7 +201,7 @@ def parse(self, curl_command: str) -> Optional[Request]:
             data_payload = parsed_args.data_binary  # Fallback to string
 
         elif parsed_args.data_raw is not None:
-            data_payload = parsed_args.data_raw
+            data_payload = parsed_args.data_raw.lstrip("$")
 
         elif parsed_args.data is not None:
             data_payload = parsed_args.data
@@ -317,8 +317,8 @@ def show_page_in_browser(page: Selector):  # pragma: no cover
     try:
         fd, fname = make_temp_file(prefix="scrapling_view_", suffix=".html")
-        with open(fd, "wb") as f:
-            f.write(page.body)
+        with open(fd, "w", encoding=page.encoding) as f:
+            f.write(page.html_content)
 
         open_in_browser(f"file://{fname}")
     except IOError as e:
@@ -545,7 +545,7 @@ def _extract_content(
     for page in pages:
         match extraction_type:
             case "markdown":
-                yield cls._convert_to_markdown(page.body)
+                yield cls._convert_to_markdown(page.html_content)
             case "html":
                 yield page.body
             case "text":
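
Note on the `lstrip("$")` change: curl commands copied from browser devtools often wrap payloads in bash ANSI-C quoting (`$'...'`). A real shell consumes the `$`, but a Python tokenizer keeps it, so the parser has to strip it. A small sketch of the failure mode, with a made-up payload:

```python
import shlex

# Chrome's "Copy as cURL (bash)" can emit: --data-raw $'{"key":"value"}'
args = shlex.split("""--data-raw $'{"key":"value"}'""")
payload = args[1]
print(payload)              # ${"key":"value"}  <- stray "$" a shell would have eaten
print(payload.lstrip("$"))  # {"key":"value"}   <- what the parser now forwards
```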

scrapling/engines/_browsers/_base.py

Lines changed: 2 additions & 28 deletions
@@ -1,4 +1,4 @@
-from time import time, sleep
+from time import time
 from asyncio import sleep as asyncio_sleep, Lock
 
 from camoufox import DefaultAddons
@@ -44,23 +44,7 @@ def _get_page(
     ) -> PageInfo:  # pragma: no cover
         """Get a new page to use"""
 
-        # Close all finished pages to ensure clean state
-        self.page_pool.close_all_finished_pages()
-
-        # If we're at max capacity after cleanup, wait for busy pages to finish
-        if self.page_pool.pages_count >= self.max_pages:
-            start_time = time()
-            while time() - start_time < self._max_wait_for_page:
-                # Wait for any pages to finish, then clean them up
-                sleep(0.05)
-                self.page_pool.close_all_finished_pages()
-                if self.page_pool.pages_count < self.max_pages:
-                    break
-            else:
-                raise TimeoutError(
-                    f"No pages finished to clear place in the pool within the {self._max_wait_for_page}s timeout period"
-                )
-
+        # No need to check if a page is available or not in sync code because the code blocked before reaching here till the page closed, ofc.
         page = self.context.new_page()
         page.set_default_navigation_timeout(timeout)
         page.set_default_timeout(timeout)
@@ -76,11 +60,6 @@ def _get_page(
 
         return self.page_pool.add_page(page)
 
-    @staticmethod
-    def _get_with_precedence(request_value: Any, session_value: Any, sentinel_value: object) -> Any:
-        """Get value with request-level priority over session-level"""
-        return request_value if request_value is not sentinel_value else session_value
-
     def get_pool_stats(self) -> Dict[str, int]:
         """Get statistics about the current page pool"""
         return {
@@ -105,16 +84,11 @@ async def _get_page(
     ) -> PageInfo:  # pragma: no cover
         """Get a new page to use"""
         async with self._lock:
-            # Close all finished pages to ensure clean state
-            await self.page_pool.aclose_all_finished_pages()
-
             # If we're at max capacity after cleanup, wait for busy pages to finish
             if self.page_pool.pages_count >= self.max_pages:
                 start_time = time()
                 while time() - start_time < self._max_wait_for_page:
-                    # Wait for any pages to finish, then clean them up
                     await asyncio_sleep(0.05)
-                    await self.page_pool.aclose_all_finished_pages()
                     if self.page_pool.pages_count < self.max_pages:
                         break
                 else:
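
Note on this change: the sync session can drop its capacity-wait loop because sync Playwright blocks the caller until the previous page closes, while the async session keeps the wait since concurrent coroutines can still race for pool slots. The surviving async logic, condensed into a standalone sketch (the pool attributes follow the diff, but this function is an illustration, not the module's code):

```python
from asyncio import sleep as asyncio_sleep
from time import time


async def _wait_for_page_slot(page_pool, max_pages: int, max_wait: float) -> None:
    """Block until the pool has room for one more page, or raise on timeout."""
    if page_pool.pages_count >= max_pages:
        start_time = time()
        while time() - start_time < max_wait:
            await asyncio_sleep(0.05)  # yield control so busy pages can finish and close
            if page_pool.pages_count < max_pages:
                return
        raise TimeoutError(f"No pages finished to clear place in the pool within the {max_wait}s timeout period")
```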

scrapling/engines/_browsers/_camoufox.py

Lines changed: 39 additions & 38 deletions
@@ -16,7 +16,7 @@
 )
 from playwright._impl._errors import Error as PlaywrightError
 
-from ._validators import validate, CamoufoxConfig
+from ._validators import validate_fetch as _validate
 from ._base import SyncSession, AsyncSession, StealthySessionMixin
 from scrapling.core.utils import log
 from scrapling.core._types import (
@@ -297,23 +297,22 @@ def fetch(
         :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
         :return: A `Response` object.
         """
-        # Validate all resolved parameters
-        params = validate(
-            dict(
-                google_search=self._get_with_precedence(google_search, self.google_search, _UNSET),
-                timeout=self._get_with_precedence(timeout, self.timeout, _UNSET),
-                wait=self._get_with_precedence(wait, self.wait, _UNSET),
-                page_action=self._get_with_precedence(page_action, self.page_action, _UNSET),
-                extra_headers=self._get_with_precedence(extra_headers, self.extra_headers, _UNSET),
-                disable_resources=self._get_with_precedence(disable_resources, self.disable_resources, _UNSET),
-                wait_selector=self._get_with_precedence(wait_selector, self.wait_selector, _UNSET),
-                wait_selector_state=self._get_with_precedence(wait_selector_state, self.wait_selector_state, _UNSET),
-                network_idle=self._get_with_precedence(network_idle, self.network_idle, _UNSET),
-                load_dom=self._get_with_precedence(load_dom, self.load_dom, _UNSET),
-                solve_cloudflare=self._get_with_precedence(solve_cloudflare, self.solve_cloudflare, _UNSET),
-                selector_config=self._get_with_precedence(selector_config, self.selector_config, _UNSET),
-            ),
-            CamoufoxConfig,
+        params = _validate(
+            [
+                ("google_search", google_search, self.google_search),
+                ("timeout", timeout, self.timeout),
+                ("wait", wait, self.wait),
+                ("page_action", page_action, self.page_action),
+                ("extra_headers", extra_headers, self.extra_headers),
+                ("disable_resources", disable_resources, self.disable_resources),
+                ("wait_selector", wait_selector, self.wait_selector),
+                ("wait_selector_state", wait_selector_state, self.wait_selector_state),
+                ("network_idle", network_idle, self.network_idle),
+                ("load_dom", load_dom, self.load_dom),
+                ("solve_cloudflare", solve_cloudflare, self.solve_cloudflare),
+                ("selector_config", selector_config, self.selector_config),
+            ],
+            _UNSET,
         )
 
         if self._closed:  # pragma: no cover
@@ -381,8 +380,9 @@ def handle_response(finished_response: SyncPlaywrightResponse):
                 page_info.page, first_response, final_response, params.selector_config
             )
 
-            # Mark the page as finished for next use
-            page_info.mark_finished()
+            # Close the page, to free up resources
+            page_info.page.close()
+            self.page_pool.pages.remove(page_info)
 
             return response
 
@@ -616,22 +616,22 @@ async def fetch(
         :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
         :return: A `Response` object.
         """
-        params = validate(
-            dict(
-                google_search=self._get_with_precedence(google_search, self.google_search, _UNSET),
-                timeout=self._get_with_precedence(timeout, self.timeout, _UNSET),
-                wait=self._get_with_precedence(wait, self.wait, _UNSET),
-                page_action=self._get_with_precedence(page_action, self.page_action, _UNSET),
-                extra_headers=self._get_with_precedence(extra_headers, self.extra_headers, _UNSET),
-                disable_resources=self._get_with_precedence(disable_resources, self.disable_resources, _UNSET),
-                wait_selector=self._get_with_precedence(wait_selector, self.wait_selector, _UNSET),
-                wait_selector_state=self._get_with_precedence(wait_selector_state, self.wait_selector_state, _UNSET),
-                network_idle=self._get_with_precedence(network_idle, self.network_idle, _UNSET),
-                load_dom=self._get_with_precedence(load_dom, self.load_dom, _UNSET),
-                solve_cloudflare=self._get_with_precedence(solve_cloudflare, self.solve_cloudflare, _UNSET),
-                selector_config=self._get_with_precedence(selector_config, self.selector_config, _UNSET),
-            ),
-            CamoufoxConfig,
+        params = _validate(
+            [
+                ("google_search", google_search, self.google_search),
+                ("timeout", timeout, self.timeout),
+                ("wait", wait, self.wait),
+                ("page_action", page_action, self.page_action),
+                ("extra_headers", extra_headers, self.extra_headers),
+                ("disable_resources", disable_resources, self.disable_resources),
+                ("wait_selector", wait_selector, self.wait_selector),
+                ("wait_selector_state", wait_selector_state, self.wait_selector_state),
+                ("network_idle", network_idle, self.network_idle),
+                ("load_dom", load_dom, self.load_dom),
+                ("solve_cloudflare", solve_cloudflare, self.solve_cloudflare),
+                ("selector_config", selector_config, self.selector_config),
+            ],
+            _UNSET,
         )
 
         if self._closed:  # pragma: no cover
@@ -701,8 +701,9 @@ async def handle_response(finished_response: AsyncPlaywrightResponse):
                 page_info.page, first_response, final_response, params.selector_config
             )
 
-            # Mark the page as finished for next use
-            page_info.mark_finished()
+            # Close the page, to free up resources
+            await page_info.page.close()
+            self.page_pool.pages.remove(page_info)
 
             return response
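
Note on the refactor: twelve `_get_with_precedence` calls collapse into one list of `(name, request_value, session_value)` triples plus the `_UNSET` sentinel. A guess at the helper's shape, reconstructed from the removed `_get_with_precedence` logic; the real `validate_fetch` presumably also performs the model validation that `CamoufoxConfig` used to handle:

```python
_UNSET = object()  # sentinel meaning "not supplied at request level"


def validate_fetch(triples, sentinel):
    """Sketch only: resolve request-level values over session-level defaults.

    Each triple is (name, request_value, session_value); a request value equal
    to the sentinel means the caller did not pass it, so the session default wins.
    """
    resolved = {}
    for name, request_value, session_value in triples:
        resolved[name] = request_value if request_value is not sentinel else session_value
    return resolved  # the real helper presumably validates types as well
```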
