Skip to content

Commit d391b00

Browse files
[!!!][FEATURE] Don't write crawling response body by default
1 parent cce7efd commit d391b00

File tree

8 files changed

+409
-1
lines changed

8 files changed

+409
-1
lines changed

docs/config-reference/crawler-options.md

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -324,3 +324,49 @@ CACHE_WARMUP_CRAWLER_OPTIONS='{"request_options": {"delay": 500, "timeout": 10}}
324324
```
325325

326326
:::
327+
328+
### `write_response_body` <Badge type="tip" text="4.0+" />
329+
330+
<small>🎨&nbsp;Type: `bool` &middot; 🐝&nbsp;Default: `false`</small>
331+
332+
> Define whether or not to write the response body of crawled URLs to the corresponding
333+
> response object.
334+
335+
::: warning
336+
Enabling this option may significantly increase memory consumption during cache warmup.
337+
:::
338+
339+
::: code-group
340+
341+
```bash [CLI]
342+
./cache-warmup.phar --crawler-options '{"write_response_body": true}'
343+
```
344+
345+
```json [JSON]
346+
{
347+
"crawlerOptions": {
348+
"write_response_body": true
349+
}
350+
}
351+
```
352+
353+
```php [PHP]
354+
use EliasHaeussler\CacheWarmup;
355+
356+
return static function (CacheWarmup\Config\CacheWarmupConfig $config) {
357+
$config->setCrawlerOption('write_response_body', true);
358+
359+
return $config;
360+
};
361+
```
362+
363+
```yaml [YAML]
364+
crawlerOptions:
365+
write_response_body: true
366+
```
367+
368+
```bash [.env]
369+
CACHE_WARMUP_CRAWLER_OPTIONS='{"write_response_body": true}'
370+
```
371+
372+
:::

src/Crawler/ConcurrentCrawler.php

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
* request_headers: array<string, string>,
4444
* request_options: array<string, mixed>,
4545
* client_config: array<string, mixed>,
46+
* write_response_body: bool,
4647
* }>
4748
*/
4849
final class ConcurrentCrawler extends AbstractConfigurableCrawler implements LoggingCrawler, StoppableCrawler

src/Crawler/ConcurrentCrawlerTrait.php

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,13 @@
2626
use EliasHaeussler\CacheWarmup\Http;
2727
use GuzzleHttp\ClientInterface;
2828
use GuzzleHttp\Pool;
29+
use GuzzleHttp\RequestOptions;
2930
use Psr\Http\Message;
3031
use Symfony\Component\OptionsResolver;
3132

33+
use function array_key_exists;
34+
use function fopen;
35+
3236
/**
3337
* ConcurrentCrawlerTrait.
3438
*
@@ -65,6 +69,11 @@ protected function configureOptions(OptionsResolver\OptionsResolver $optionsReso
6569
->allowedTypes('array')
6670
->default([])
6771
;
72+
73+
$optionsResolver->define('write_response_body')
74+
->allowedTypes('bool')
75+
->default(false)
76+
;
6877
}
6978

7079
/**
@@ -81,11 +90,19 @@ protected function createPool(
8190
$this->options['request_method'],
8291
$this->options['request_headers'],
8392
);
93+
$options = $this->options['request_options'];
94+
95+
if (!$this->options['write_response_body'] && !array_key_exists(RequestOptions::SINK, $options)) {
96+
Http\Message\Stream\NullStream::register();
97+
$options[RequestOptions::SINK] = fopen('null:///', 'w+');
98+
} else {
99+
Http\Message\Stream\NullStream::unregister();
100+
}
84101

85102
return Http\Message\RequestPoolFactory::create($requestFactory->buildIterable($urls))
86103
->withClient($client)
87104
->withConcurrency($this->options['concurrency'])
88-
->withOptions($this->options['request_options'])
105+
->withOptions($options)
89106
->withResponseHandler(...$handlers)
90107
->withStopOnFailure($stopOnFailure)
91108
->createPool()

src/Crawler/OutputtingCrawler.php

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
* request_headers: array<string, string>,
4747
* request_options: array<string, mixed>,
4848
* client_config: array<string, mixed>,
49+
* write_response_body: bool,
4950
* }>
5051
*/
5152
final class OutputtingCrawler extends AbstractConfigurableCrawler implements LoggingCrawler, StoppableCrawler, VerboseCrawler
Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
/*
6+
* This file is part of the Composer package "eliashaeussler/cache-warmup".
7+
*
8+
* Copyright (C) 2020-2024 Elias Häußler <[email protected]>
9+
*
10+
* This program is free software: you can redistribute it and/or modify
11+
* it under the terms of the GNU General Public License as published by
12+
* the Free Software Foundation, either version 3 of the License, or
13+
* (at your option) any later version.
14+
*
15+
* This program is distributed in the hope that it will be useful,
16+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
17+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18+
* GNU General Public License for more details.
19+
*
20+
* You should have received a copy of the GNU General Public License
21+
* along with this program. If not, see <https://www.gnu.org/licenses/>.
22+
*/
23+
24+
namespace EliasHaeussler\CacheWarmup\Http\Message\Stream;
25+
26+
use function in_array;
27+
use function stream_get_wrappers;
28+
use function stream_wrapper_register;
29+
use function stream_wrapper_unregister;
30+
31+
/**
32+
* NullStream.
33+
*
34+
* @author Elias Häußler <[email protected]>
35+
* @license GPL-3.0-or-later
36+
*/
37+
/**
 * Stream wrapper for the "null" protocol that discards everything written to it.
 *
 * While registered, a resource opened with a `null://` URL accepts writes
 * without storing them and always behaves like an empty stream when read.
 */
final class NullStream
{
    private const PROTOCOL = 'null';

    /**
     * Context slot conventionally declared on userland stream wrapper classes.
     *
     * @var resource|null
     */
    public $context;

    /**
     * Registers this class as handler of the "null" protocol, unless a
     * wrapper for that protocol is already registered.
     */
    public static function register(): void
    {
        if (!self::isRegistered()) {
            stream_wrapper_register(self::PROTOCOL, self::class);
        }
    }

    /**
     * Removes the "null" protocol wrapper again, if one is currently registered.
     */
    public static function unregister(): void
    {
        if (self::isRegistered()) {
            stream_wrapper_unregister(self::PROTOCOL);
        }
    }

    /**
     * Tells whether any wrapper is currently registered for the "null" protocol.
     */
    private static function isRegistered(): bool
    {
        return in_array(self::PROTOCOL, stream_get_wrappers(), true);
    }

    /**
     * Nothing is held open, so closing is a no-op.
     */
    public function stream_close(): void {}

    /**
     * The stream never contains data and is therefore always at its end.
     */
    public function stream_eof(): bool
    {
        return true;
    }

    /**
     * There is nothing buffered, so flushing always succeeds.
     */
    public function stream_flush(): bool
    {
        return true;
    }

    /**
     * Accepts every path and mode; no underlying resource is opened.
     */
    public function stream_open(string $path, string $mode, int $options, ?string &$opened_path): bool
    {
        return true;
    }

    /**
     * Always yields an empty string because written data is never stored.
     */
    public function stream_read(int $count): string
    {
        return '';
    }

    /**
     * Reports every seek as successful; the position itself is not tracked.
     */
    public function stream_seek(int $count, int $whence = SEEK_SET): bool
    {
        return true;
    }

    /**
     * Provides no stat information.
     *
     * @return array{}
     */
    public function stream_stat(): array
    {
        return [];
    }

    /**
     * The position is constantly reported as the beginning of the stream.
     */
    public function stream_tell(): int
    {
        return 0;
    }

    /**
     * Discards the given data and reports all of it as written.
     *
     * The full byte length is returned, as the stream wrapper contract asks
     * for the number of bytes stored. Reporting fewer bytes than received
     * would look like a partial write to the code writing to the stream.
     *
     * @return int number of bytes in $data (0 for an empty string)
     */
    public function stream_write(string $data): int
    {
        return \strlen($data);
    }
}

tests/unit/Crawler/ConcurrentCrawlerTest.php

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,10 +28,13 @@
2828
use EliasHaeussler\TransientLogger;
2929
use GuzzleHttp\Client;
3030
use GuzzleHttp\Psr7;
31+
use GuzzleHttp\RequestOptions;
3132
use PHPUnit\Framework;
3233
use Psr\Http\Message;
3334
use Psr\Log;
3435

36+
use function stream_get_meta_data;
37+
3538
/**
3639
* ConcurrentCrawlerTest.
3740
*
@@ -94,6 +97,45 @@ public function crawlIgnoresGivenClientConfigIfInstantiatedClientIsPassed(): voi
9497
self::assertNull($this->mockHandler->getLastRequest());
9598
}
9699

100+
#[Framework\Attributes\Test]
public function crawlDoesNotWritesResponseBodyByDefault(): void
{
    // Queue a single response and crawl one URL with the default options.
    $this->mockHandler->append(new Psr7\Response());
    $this->subject->crawl([new Psr7\Uri('https://www.example.org')]);

    // The request options seen by the handler must carry a sink resource ...
    $sink = $this->mockHandler->getLastOptions()[RequestOptions::SINK] ?? null;
    self::assertIsResource($sink);

    // ... and that resource must be backed by the NullStream wrapper.
    $metaData = stream_get_meta_data($sink);
    self::assertInstanceOf(
        Src\Http\Message\Stream\NullStream::class,
        $metaData['wrapper_data'],
    );
}
116+
117+
#[Framework\Attributes\Test]
public function crawlWritesResponseBodyIfConfigured(): void
{
    $this->mockHandler->append(new Psr7\Response());

    // Build a dedicated crawler that has writing of response bodies enabled.
    $crawlerOptions = [
        'client_config' => [
            'handler' => $this->mockHandler,
        ],
        'write_response_body' => true,
    ];
    $crawler = new Src\Crawler\ConcurrentCrawler($crawlerOptions);
    $crawler->crawl([new Psr7\Uri('https://www.example.org')]);

    // With the option enabled, no sink must show up in the request options.
    $requestOptions = $this->mockHandler->getLastOptions();

    self::assertNull($requestOptions[RequestOptions::SINK] ?? null);
}
138+
97139
#[Framework\Attributes\Test]
98140
public function crawlSendsRequestToAllGivenUrls(): void
99141
{

tests/unit/Crawler/OutputtingCrawlerTest.php

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,11 +28,14 @@
2828
use EliasHaeussler\TransientLogger;
2929
use GuzzleHttp\Client;
3030
use GuzzleHttp\Psr7;
31+
use GuzzleHttp\RequestOptions;
3132
use PHPUnit\Framework;
3233
use Psr\Http\Message;
3334
use Psr\Log;
3435
use Symfony\Component\Console;
3536

37+
use function stream_get_meta_data;
38+
3639
/**
3740
* OutputtingCrawlerTest.
3841
*
@@ -100,6 +103,45 @@ public function crawlIgnoresGivenClientConfigIfInstantiatedClientIsPassed(): voi
100103
self::assertNull($this->mockHandler->getLastRequest());
101104
}
102105

106+
#[Framework\Attributes\Test]
public function crawlDoesNotWritesResponseBodyByDefault(): void
{
    // Queue a single response and crawl one URL with the default options.
    $this->mockHandler->append(new Psr7\Response());
    $this->subject->crawl([new Psr7\Uri('https://www.example.org')]);

    // The request options seen by the handler must carry a sink resource ...
    $sink = $this->mockHandler->getLastOptions()[RequestOptions::SINK] ?? null;
    self::assertIsResource($sink);

    // ... and that resource must be backed by the NullStream wrapper.
    $metaData = stream_get_meta_data($sink);
    self::assertInstanceOf(
        Src\Http\Message\Stream\NullStream::class,
        $metaData['wrapper_data'],
    );
}
122+
123+
#[Framework\Attributes\Test]
public function crawlWritesResponseBodyIfConfigured(): void
{
    $this->mockHandler->append(new Psr7\Response());

    // Use the crawler covered by this test case; instantiating
    // ConcurrentCrawler here would leave OutputtingCrawler untested
    // for the enabled "write_response_body" option.
    $subject = new Src\Crawler\OutputtingCrawler(
        [
            'client_config' => [
                'handler' => $this->mockHandler,
            ],
            'write_response_body' => true,
        ],
    );
    // NOTE(review): assumes setOutput() is available via VerboseCrawler;
    // a NullOutput keeps crawler output out of the test run.
    $subject->setOutput(new Console\Output\NullOutput());

    $subject->crawl([new Psr7\Uri('https://www.example.org')]);

    // With the option enabled, no sink must show up in the request options.
    $lastOptions = $this->mockHandler->getLastOptions();
    $sink = $lastOptions[RequestOptions::SINK] ?? null;

    self::assertNull($sink);
}
144+
103145
#[Framework\Attributes\Test]
104146
public function crawlSendsRequestToAllGivenUrls(): void
105147
{

0 commit comments

Comments
 (0)