Skip to content

Commit c50a8b4

Browse files
committed
Modify requirements to enable specific caches, update examples
1 parent a587147 commit c50a8b4

File tree

10 files changed

+118
-81
lines changed

10 files changed

+118
-81
lines changed

examples/application.cr

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
require "../src/squirm/caches/redis"
12
require "../src/squirm"
23
require "./human_resources/**"
34
require "./resourceful_humans/**"
@@ -6,14 +7,29 @@ Log.setup(:debug)
67

78
engine = Squirm::Engine.new
89

9-
engine.add_spider(HumanResources::Spider.new)
10-
engine.add_spider(ResourcefulHumans::Spider.new)
10+
spiders = [
11+
HumanResources::Spider.new,
12+
ResourcefulHumans::Spider.new,
13+
] of Squirm::Spider
14+
15+
spiders.each do |spider|
16+
engine.add_spider(spider)
17+
end
18+
19+
engine.run
1120

1221
loop do
13-
sleep 60
22+
spiders.each do |spider|
23+
unless Squirm::RequestStorage.instance.empty?(spider.id)
24+
size = Squirm::RequestStorage
25+
.instance
26+
.requests
27+
.[spider.id]
28+
.size
1429

15-
engine.spiders.each do |spider|
16-
queue_size = Squirm::RequestStorage.instance.requests[spider.id].size
17-
Log.info { "Spider #{spider.id} is running and has queued #{queue_size} requests." } if queue_size != 0
30+
Log.debug { "#{spider.id} running with #{size} request(s)" }
31+
end
1832
end
19-
end
33+
34+
sleep 30
35+
end

examples/human_resources/spider.cr

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,10 @@ module HumanResources
1515
property start_urls : Array(String) = ["https://www.hr.gov.ge/?pageNo=1"]
1616

1717
# Caching mechanism used by the spider to cache the requests in case of a restart/failure.
18-
property cache : Squirm::Caches::Base = Squirm::Caches::RocksDB.new(@@id)
18+
property cache : Squirm::Caches::Base = Squirm::Caches::Redis.new(@@id)
19+
20+
# Used by the engine to fetch the URLs.
21+
property fetcher : Squirm::Fetchers::Base = Squirm::Fetchers::Default.new
1922

2023
# Parser used by the spider to parse the HTML content.
2124
property parser : Squirm::Parser = Parser.new
@@ -26,21 +29,19 @@ module HumanResources
2629
# Used by the spider to filter the responses.
2730
property response_filters : Array(Squirm::ResponseFilters::Base) = [Squirm::ResponseFilters::ContentValidator.new(selector: ".Title-box")] of Squirm::ResponseFilters::Base
2831

29-
# Time spent between each request
30-
property timeout : Time::Span = 5.seconds
32+
# Time spent between each request.
33+
property request_timeout : Time::Span = 5.seconds
3134

32-
# Concurrent requests per domain
33-
property concurrent_requests_per_domain : Int32 = 5
35+
# Concurrent requests per domain.
36+
property concurrent_requests_per_domain : Int32 = 2
3437

35-
# Used by the engine to fetch the URLs
36-
property fetcher : Squirm::Fetchers::Base = Squirm::Fetchers::Default.new
3738

3839
# Used by the caching mechanism to retrieve the requests from the cache.
3940
def start_requests : Array(Squirm::Request)
4041
cache.list_requests!(base_url())
4142
end
4243

43-
# Parsing logic to identify the listing URLs and pagination URLs
44+
# Parsing logic to identify the listing URLs and pagination URLs.
4445
def parse_item(request : Squirm::Request, response : Squirm::Response) : Squirm::ParsedItem
4546
cache.delete!(request.url)
4647

@@ -76,7 +77,7 @@ module HumanResources
7677
.map { |href| Squirm::Utils.build_absolute_url(href, base_url) }
7778
end
7879

79-
# Parse HTML for pagination URLs
80+
# Parse HTML for pagination URLs.
8081
def pagination_urls(document : Lexbor::Parser) : Array(String)
8182
document
8283
.find("li.PagedList-skipToNext a")

examples/resourceful_humans/spider.cr

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@ module ResourcefulHumans
1717
# Caching mechanism used by the spider to cache the requests in case of a restart/failure.
1818
property cache : Squirm::Caches::Base = Squirm::Caches::Redis.new(@@id)
1919

20+
# If you want to use the Chrome fetcher add the chromedriver to your PATH.
21+
property fetcher : Squirm::Fetchers::Base = Squirm::Fetchers::Default.new
22+
2023
# Parser used by the spider to parse the HTML content.
2124
property parser : Squirm::Parser = Parser.new
2225

@@ -26,23 +29,18 @@ module ResourcefulHumans
2629
# Used by the spider to filter the responses.
2730
property response_filters : Array(Squirm::ResponseFilters::Base) = [Squirm::ResponseFilters::ContentValidator.new(selector: ".ann-title")] of Squirm::ResponseFilters::Base
2831

29-
# Time spent between each request
30-
property timeout : Time::Span = 5.seconds
31-
32-
# Concurrent requests per domain
33-
property concurrent_requests_per_domain : Int32 = 5
32+
# Time spent between each request.
33+
property request_timeout : Time::Span = 5.seconds
3434

35-
#
36-
# If you want to use the Chrome fetcher add the chromedriver to your PATH
37-
#
38-
property fetcher : Squirm::Fetchers::Base = Squirm::Fetchers::Chrome.new
35+
# Concurrent requests per domain.
36+
property concurrent_requests_per_domain : Int32 = 2
3937

4038
# Used by the caching mechanism to retrieve the requests from the cache.
4139
def start_requests : Array(Squirm::Request)
4240
cache.list_requests!(base_url())
4341
end
4442

45-
# Parsing logic to identify the listing URLs and pagination URLs
43+
# Parsing logic to identify the listing URLs and pagination URLs.
4644
def parse_item(request : Squirm::Request, response : Squirm::Response) : Squirm::ParsedItem
4745
cache.delete!(request.url)
4846

@@ -78,7 +76,7 @@ module ResourcefulHumans
7876
.map { |href| Squirm::Utils.build_absolute_url(href, base_url) }
7977
end
8078

81-
# Parse HTML for pagination URLs
79+
# Parse HTML for pagination URLs.
8280
def pagination_urls(document : Lexbor::Parser) : Array(String)
8381
document
8482
.find(".paging-container a.item")

shard.yml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
name: squirm
2-
version: 0.1.1
2+
version: 0.2.0
33

44
authors:
55
- Giorgi Kavrelishvili <[email protected]>
@@ -29,7 +29,6 @@ dependencies:
2929
development_dependencies:
3030
ameba:
3131
github: crystal-ameba/ameba
32-
version: ~> 0.13.0
3332

3433
crystal: ~> 1.2.0
3534

src/squirm.cr

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,11 @@ require "lexbor"
33
require "robots"
44
require "log"
55

6-
require "./squirm/**"
6+
require "./squirm/fetchers/**"
7+
require "./squirm/ext/**"
8+
require "./squirm/request_filters/**"
9+
require "./squirm/response_filters/**"
10+
require "./squirm/*"
711

812
module Squirm
913
{% unless flag?(:preview_mt) %}

src/squirm/caches/redis.cr

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
require "./base"
12
require "redis"
23

34
module Squirm

src/squirm/caches/rocksdb.cr

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
require "./base"
12
require "rocksdb"
23

34
module Squirm

src/squirm/engine.cr

Lines changed: 33 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -5,44 +5,48 @@ module Squirm
55
getter spiders : Synchronized(Array(Spider)) = Synchronized(Array(Spider)).new
66

77
def add_spider(spider : Spider)
8-
RequestStorage.instance.store(spider, spider.start_urls) if spider.start_requests.empty?
9-
RequestStorage.instance.store(spider, spider.start_requests)
8+
RequestStorage.instance.store(spider.id, spider.start_urls) if spider.start_requests.empty?
9+
RequestStorage.instance.store(spider.id, spider.start_requests)
1010

1111
@spiders.push(spider)
12+
end
1213

13-
spawn do
14-
pool = Pool.new(spider.concurrent_requests_per_domain)
15-
fetcher = spider.fetcher
14+
def run
15+
spiders.each do |spider|
16+
spawn do
17+
pool = Pool.new(spider.concurrent_requests_per_domain)
18+
fetcher = spider.fetcher
1619

17-
loop do
18-
unless RequestStorage.instance.empty?(spider)
19-
request = RequestStorage.instance.pop!(spider)
20-
request.spider = spider
20+
loop do
21+
unless RequestStorage.instance.empty?(spider.id)
22+
request = RequestStorage.instance.pop!(spider.id)
23+
request.spider = spider
2124

22-
pool.spawn do
23-
begin
24-
response = fetcher.fetch(request)
25+
pool.spawn do
26+
begin
27+
response = fetcher.fetch(request)
2528

26-
parsed_item = spider.parse_item(request, response)
27-
parse(spider, parsed_item)
29+
parsed_item = spider.parse_item(request, response)
30+
parse(spider, parsed_item)
2831

29-
sleep(spider.timeout)
30-
rescue exception : Crest::RequestFailed
31-
status_code = exception.response.status_code.to_i
32+
sleep(spider.request_timeout)
33+
rescue exception : Crest::RequestFailed
34+
status_code = exception.response.status_code.to_i
3235

33-
case status_code
34-
when 429, 500..511
35-
Log.error(exception: exception) { exception.message }
36+
case status_code
37+
when 429, 500..511
38+
Log.error(exception: exception) { exception.message }
3639

37-
if request.retriable?
38-
request.retry
39-
RequestStorage.instance.store(spider, request)
40+
if request.retriable?
41+
request.retry
42+
RequestStorage.instance.store(spider.id, request)
43+
end
44+
else
45+
Log.error(exception: exception) { "Dropping the request, failed to get a response status code which could be used to recover a request." }
4046
end
41-
else
42-
Log.error(exception: exception) { "Dropping the request, failed to get a response status code which could be used to recover a request." }
47+
rescue exception : Exception
48+
Log.error(exception: exception) { "Dropping the request, a non HTTP error occured." }
4349
end
44-
rescue exception : Exception
45-
Log.error(exception: exception) { "Dropping the request, a non HTTP error occured." }
4650
end
4751
end
4852
end
@@ -52,7 +56,7 @@ module Squirm
5256

5357
def remove_spider(spider : Spider)
5458
spider.cache.flush
55-
RequestStorage.instance.flush(spider)
59+
RequestStorage.instance.flush(spider.id)
5660
@spiders.delete(spider)
5761
end
5862

@@ -62,7 +66,7 @@ module Squirm
6266
end
6367

6468
private def parse_requests(spider : Spider, parsed_item : ParsedItem)
65-
RequestStorage.instance.store(spider, parsed_item.requests)
69+
RequestStorage.instance.store(spider.id, parsed_item.requests)
6670
end
6771

6872
private def parse_items(spider : Spider, parsed_item : ParsedItem)

src/squirm/request_storage.cr

Lines changed: 34 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -11,48 +11,61 @@ module Squirm
1111
getter requests : Synchronized(Hash(String, Array(Request))) = Synchronized(Hash(String, Array(Request))).new
1212
getter history : Synchronized(Hash(String, Array(String))) = Synchronized(Hash(String, Array(String))).new
1313

14-
def store(spider : Spider, request : Request)
15-
if @requests.has_key?(spider.id)
16-
unless @history[spider.id].includes?(request.url)
17-
@history[spider.id].push(request.url)
18-
@requests[spider.id].push(request)
14+
def store(id : String, request : Request)
15+
if @requests.has_key?(id)
16+
unless @history[id].includes?(request.url)
17+
@history[id].push(request.url)
18+
@requests[id].push(request)
1919
end
2020
else
21-
@history[spider.id] = [request.url]
22-
@requests[spider.id] = [request]
21+
@history[id] = [request.url]
22+
@requests[id] = [request]
2323
end
2424
end
2525

26-
def store(spider : Spider, requests : Array(Request))
26+
def store(id : String, requests : Array(Request))
2727
requests.each do |request|
28-
store(spider, request)
28+
store(id, request)
2929
end
3030
end
3131

32-
def store(spider : Spider, url : String)
33-
store(spider, Request.new(:get, url))
32+
def store(id : String, url : String)
33+
store(id, Request.new(:get, url))
3434
end
3535

36-
def store(spider : Spider, urls : Array(String))
36+
def store(id : String, urls : Array(String))
3737
urls.each do |url|
38-
store(spider, url)
38+
store(id, url)
3939
end
4040
end
4141

42-
def pop!(spider : Spider) : Request
43-
@requests[spider.id].pop
42+
def flush(id : String)
43+
@requests[id] = [] of Request
4444
end
4545

46-
def pop?(spider : Spider) : Request?
47-
@requests[spider.id].pop?
46+
def clear(id : String)
47+
@requests[id] = [] of Request
48+
@history[id] = [] of String
4849
end
4950

50-
def flush(spider : Spider)
51-
@requests[spider.id] = [] of Request
51+
def pop!(id : String)
52+
@requests[id].pop? || raise Exception.new("Request storage is empty")
5253
end
5354

54-
def empty?(spider : Spider) : Bool
55-
@requests[spider.id].empty?
55+
def delete_history(id : String, url : String)
56+
@history[id].delete(url)
57+
end
58+
59+
def seen?(id : String, url : String) : Bool
60+
@history[id].includes?(url)
61+
end
62+
63+
def empty?(id : String) : Bool
64+
@requests[id].empty?
65+
end
66+
67+
def exists?(id : String) : Bool
68+
@requests[id]? != nil || @history[id]? != nil
5669
end
5770
end
5871
end

src/squirm/spider.cr

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ module Squirm
1212
abstract def parse_item(request : Request, response : Response) : ParsedItem
1313
abstract def request_filters : Array(RequestFilters::Base)
1414
abstract def response_filters : Array(ResponseFilters::Base)
15-
abstract def timeout : Time::Span
15+
abstract def request_timeout : Time::Span
1616
abstract def concurrent_requests_per_domain : Int32
1717
end
1818
end

0 commit comments

Comments
 (0)