Commit 6fb72be

spidy v1.3
Improved file saving: All files saved and opened with UTF-8 encoding, and UnicodeErrors are ignored.
1 parent e8c9405 commit 6fb72be
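
The same open() pattern is applied throughout crawler.py in this commit. A minimal sketch of that pattern, with a placeholder file name and content that are not part of the commit:

    page_text = '<html>Example page content</html>'  # placeholder string, not from the commit
    # errors='ignore' drops anything that cannot be encoded rather than raising UnicodeError
    with open('example_page.html', 'w', encoding='utf-8', errors='ignore') as file:
        file.write(page_text)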

File tree: 5 files changed, +91 -70 lines changed


README.md

Lines changed: 5 additions & 4 deletions
@@ -7,20 +7,21 @@ Pretty simple!
 Developed by [rivermont](https://github.com/rivermont) (/rɪvɜːrmɒnt/) and [FalconWarriorr](https://github.com/Casillas-) (/fælcʌnraɪjɔːr/).<br>
 Looking for technical documentation? Check out [docs.md](https://github.com/rivermont/spidy/blob/master/docs.md)
 
-[![Version: 1.2.0](https://img.shields.io/badge/version-1.2.0-brightgreen.svg)](https://github.com/rivermont/spidy/releases)
+[![Version: 1.3.0](https://img.shields.io/badge/version-1.2.0-brightgreen.svg)](https://github.com/rivermont/spidy/releases)
 [![License: GPL v3](https://img.shields.io/badge/license-GPLv3.0-blue.svg)](http://www.gnu.org/licenses/gpl-3.0)
 [![Python: 3.5](https://img.shields.io/badge/python-3.5-brightgreen.svg)](https://docs.python.org/3/)
 [![Python: 3](https://img.shields.io/badge/python-3-lightgrey.svg)](https://docs.python.org/3/)
 <br>
-[![Lines of Code: 933](https://img.shields.io/badge/lines%20of%20code-933-green.svg)](#)
-[![Lines of Docs: 537](https://img.shields.io/badge/lines%20of%20docs-537-orange.svg)](#)
+[![Lines of Code: 930](https://img.shields.io/badge/lines%20of%20code-930-green.svg)](#)
+[![Lines of Docs: 538](https://img.shields.io/badge/lines%20of%20docs-538-orange.svg)](#)
 
 --------------------
 
 # New Features!
 
 ### Domain Limiting - #[e229b01](https://github.com/rivermont/spidy/commit/e229b01eed7e1f95530d06afc671e40dbf4dac53)
-Scrape only a single site instead of the whole internet. May use slightly less space on your disk.
+Scrape only a single site instead of the whole internet. May use slightly less space on your disk.<br>
+See `/config/wsj.cfg` for an example.
 
 ### Release v1.0!
 [spidy Web Crawler Release 1.0](https://github.com/rivermont/spidy/releases/tag/1.0)

config/rivermont.cfg

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 OVERWRITE = False
-RAISE_ERRORS = True
+RAISE_ERRORS = False
 SAVE_PAGES = True
 ZIP_FILES = False
 SAVE_WORDS = False

config/wsj.cfg

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+OVERWRITE = False
+RAISE_ERRORS = False
+SAVE_PAGES = True
+SAVE_WORDS = False
+ZIP_FILES = False
+
+# Whether to restrict crawling to a single domain or not.
+RESTRICT = True
+
+# The domain within which to restrict crawling.
+DOMAIN = 'wsj.com/'
+
+TODO_FILE = 'wsj_todo.txt'
+DONE_FILE = 'wsj_done.txt'
+WORD_FILE = 'wsj_words.txt'
+BAD_FILE = 'wsj_bad.txt'
+SAVE_COUNT = 60
+HEADER = HEADERS['spidy']
+MAX_NEW_ERRORS = 100
+MAX_KNOWN_ERRORS = 100
+MAX_HTTP_ERRORS = 100
+MAX_NEW_MIMES = 5
+START = ['https://www.wsj.com/']
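
The RESTRICT and DOMAIN options feed the domain-limiting feature introduced in commit e229b01, whose implementation is not part of this diff. Purely as an illustrative sketch, assuming a simple substring check against DOMAIN:

    DOMAIN = 'wsj.com/'
    candidate_links = ['https://www.wsj.com/news/', 'https://example.com/']

    # Keep only links that fall inside the configured domain (hypothetical check, not the crawler's code)
    in_domain = [link for link in candidate_links if DOMAIN in link]
    print(in_domain)  # ['https://www.wsj.com/news/']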

crawler.py

Lines changed: 62 additions & 65 deletions
@@ -2,7 +2,7 @@
 spidy Web Crawler
 Built by rivermont and FalconWarriorr
 """
-VERSION = '1.2.0'
+VERSION = '1.3.0'
 
 ##########
 # IMPORT #
@@ -20,6 +20,7 @@ def get_time():
 def get_full_time():
     return t.strftime('%H:%M:%S, %A %b %Y')
 
+
 START_TIME = int(t.time())
 START_TIME_LONG = get_time()
 
@@ -34,7 +35,7 @@ def get_full_time():
 except OSError:
     pass  # Assumes only OSError wil complain logs/ already exists
 
-LOG_FILE = open('{0}\\logs\\spidy_log_{1}.txt'.format(CRAWLER_DIR, START_TIME), 'w+')
+LOG_FILE = open('{0}\\logs\\spidy_log_{1}.txt'.format(CRAWLER_DIR, START_TIME), 'w+', encoding='utf-8', errors='ignore')
 LOG_FILE_NAME = 'logs\\spidy_log_{0}'.format(START_TIME)
 
 

@@ -47,6 +48,7 @@ def write_log(message):
     print(message)
     LOG_FILE.write('\n' + message)
 
+
 write_log('[INIT]: Starting spidy Web Crawler version {0}'.format(VERSION))
 write_log('[INIT]: Importing required libraries...')
 
@@ -131,7 +133,7 @@ def make_words(site):
     """
     Returns list of all valid words in page.
     """
-    page = str(site.content)  # Get page content
+    page = site.text  # Get page content
     word_list = page.split()  # Split content into lists of words, as separated by spaces
     del page
     word_list = list(set(word_list))  # Remove duplicates
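
make_words() now splits site.text, the response body decoded to a str by requests, instead of str(site.content), which wraps raw bytes and leaves the b'...' prefix and escape sequences in the word list. A minimal sketch of the difference, with an illustrative URL:

    import requests

    site = requests.get('https://example.com/')
    raw = str(site.content)  # "b'<!doctype html>...'" - bytes coerced to str, escapes included
    decoded = site.text      # '<!doctype html>...' - body decoded using the detected encoding
    word_list = list(set(decoded.split()))  # unique whitespace-separated tokens, as in make_words()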
@@ -146,15 +148,15 @@ def save_files():
     Saves the TODO, done, word, and bad lists into their respective files.
     Also logs the action to the console.
     """
-    with open(TODO_FILE, 'w') as todoList:
+    with open(TODO_FILE, 'w', encoding='utf-8', errors='ignore') as todoList:
         for site in TODO:
             try:
                 todoList.write(site + '\n')  # Save TODO list
             except UnicodeError:
                 continue
     write_log('[LOG]: Saved TODO list to {0}'.format(TODO_FILE))
 
-    with open(DONE_FILE, 'w') as done_list:
+    with open(DONE_FILE, 'w', encoding='utf-8', errors='ignore') as done_list:
         for site in DONE:
             try:
                 done_list.write(site + '\n')  # Save done list
@@ -219,16 +221,16 @@ def save_page(url, page):
     file_path = '{0}\\saved\\{1}{2}'.format(CRAWLER_DIR, cropped_url, ext)
 
     # Save file
-    with open(file_path, 'wb+') as file:
-        file.write(bytes('''<!-- "{0}" -->
+    with open(file_path, 'w', encoding='utf-8', errors='ignore') as file:
+        file.write('''<!-- "{0}" -->
 <!-- Downloaded with the spidy Web Crawler -->
 <!-- https://github.com/rivermont/spidy -->
-'''.format(url), 'ascii'))
-        file.write(page.content)
+'''.format(url))
+        file.write(page.text)
 
 
 def update_file(file, content, file_type):
-    with open(file, 'r+') as open_file:  # Open save file for reading and writing
+    with open(file, 'r+', encoding='utf-8', errors='ignore') as open_file:  # Open save file for reading and writing
         file_content = open_file.readlines()  # Make list of all lines in file
         contents = []
         for x in file_content:
@@ -263,7 +265,7 @@ def log(message):
     Logs a single message to the error log file.
     Prints message verbatim, so message must be formatted correctly in the function call.
     """
-    with open(ERR_LOG_FILE, 'a') as open_file:
+    with open(ERR_LOG_FILE, 'a', encoding='utf-8', errors='ignore') as open_file:
         open_file.write('\n\n======LOG======')  # Write opening line
         open_file.write('\nTIME: {0}'.format(get_full_time()))  # Write current time
         open_file.write(message)  # Write message
@@ -293,7 +295,7 @@ def err_log(url, error1, error2):
     error2 is the extended text of the error.
     """
     time = t.strftime('%H:%M:%S, %A %b %Y')  # Get the current time
-    with open(ERR_LOG_FILE, 'a') as work_log:
+    with open(ERR_LOG_FILE, 'a', encoding='utf-8', errors='ignore') as work_log:
         work_log.write('\n\n=====ERROR=====')  # Write opening line
         work_log.write('\nTIME: {0}\nURL: {1}\nERROR: {2}\nEXT: {3}'.format(time, url, error1, str(error2)))
         work_log.write(LOG_END)  # Write closing line
@@ -539,7 +541,7 @@ def init():
         else:
             file_path = 'config\\{0}.cfg'.format(input_)
         write_log('[INFO]: Loading configuration settings from {0}'.format(file_path))
-        with open(file_path, 'r') as file:
+        with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
             for line in file.readlines():
                 exec(line, globals())
     except FileNotFoundError:
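
init() treats every line of the chosen .cfg file as a Python assignment and exec()s it into the module globals, which is why the config files above read like plain Python. A self-contained sketch of that mechanism, with two inline lines standing in for file.readlines():

    # Each config line is a Python assignment executed into the global namespace.
    config_lines = [
        "RESTRICT = True\n",
        "DOMAIN = 'wsj.com/'\n",
    ]
    for line in config_lines:
        exec(line, globals())

    print(RESTRICT, DOMAIN)  # True wsj.com/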
@@ -730,15 +732,15 @@ def init():
     write_log('[INIT]: Loading save files...')
     # Import saved TODO file data
     try:
-        with open(TODO_FILE, 'r') as f:
+        with open(TODO_FILE, 'r', encoding='utf-8', errors='ignore') as f:
             contents = f.readlines()
     except FileNotFoundError:  # If no TODO file is present
         contents = []
     for line in contents:
         TODO.append(line.strip())
     # Import saved done file data
     try:
-        with open(DONE_FILE, 'r') as f:
+        with open(DONE_FILE, 'r', encoding='utf-8', errors='ignore') as f:
             contents = f.readlines()
     except FileNotFoundError:  # If no DONE file is present
         contents = []
@@ -775,10 +777,10 @@ def main():
         pass  # Assumes only OSError wil complain saved/ already exists
 
     # Create required files
-    with open(WORD_FILE, 'w'):
+    with open(WORD_FILE, 'w', encoding='utf-8', errors='ignore'):
         pass
 
-    with open(BAD_FILE, 'w'):
+    with open(BAD_FILE, 'w', encoding='utf-8', errors='ignore'):
         pass
 
     write_log('[INIT]: Successfully started spidy Web Crawler version {0}...'.format(VERSION))
@@ -817,7 +819,7 @@ def main():
                 word_list = make_words(page)  # Get all words from page
                 WORDS.update(word_list)  # Add words to word list
             try:
-                links = [link for element, attribute, link, pos in html.iterlinks(page.content)]
+                links = [link for element, attribute, link, pos in html.iterlinks(page.text)]
             except (etree.XMLSyntaxError, etree.ParserError):
                 links = []
             links = list(set(links))  # Remove duplicates and shuffle links
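
html.iterlinks() now receives page.text rather than page.content; it parses the markup and yields (element, attribute, link, pos) tuples, of which only the link is kept. A minimal standalone sketch with an illustrative URL:

    import requests
    from lxml import etree, html

    page = requests.get('https://example.com/')
    try:
        # iterlinks() yields (element, attribute, link, pos) for every link-carrying attribute
        links = list(set(link for element, attribute, link, pos in html.iterlinks(page.text)))
    except (etree.XMLSyntaxError, etree.ParserError):
        links = []  # unparseable or empty document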
@@ -844,58 +846,53 @@ def main():
             write_log('[INFO]: An error was raised trying to process {0}'.format(link))
             err_mro = type(e).mro()
 
-            # HTTP Errors
-            if str(e) == 'HTTP Error 403: Forbidden':
-                write_log('[ERR]: HTTP 403: Access Forbidden.')
-                BAD_LINKS.add(link)
-
-            elif str(e) == 'HTTP Error 429: Too Many Requests':
-                write_log('[ERR]: HTTP 429: Too Many Requests.')
-                TODO += TODO[0]  # Move link to end of TODO list
-
-            elif etree.XMLSyntaxError in err_mro or etree.ParserError in err_mro:  # Error processing html/xml
-                KNOWN_ERROR_COUNT += 1
-                write_log('[ERR]: An XMLSyntaxError occurred. A web dev screwed up somewhere.')
-                err_log(link, 'XMLSyntaxError', e)
-
-            elif UnicodeError in err_mro:  # Error trying to convert foreign characters to Unicode
-                KNOWN_ERROR_COUNT += 1
-                write_log('[ERR]: A UnicodeError occurred. URL had a foreign character or something.')
-                err_log(link, 'UnicodeError', e)
-
-            elif requests.exceptions.SSLError in err_mro:  # Invalid SSL certificate
-                KNOWN_ERROR_COUNT += 1
-                write_log('[ERR]: An SSLError occurred. Site is using an invalid certificate.')
-                err_log(link, 'SSLError', e)
-                BAD_LINKS.add(link)
-
-            elif requests.exceptions.ConnectionError in err_mro:  # Error connecting to page
-                KNOWN_ERROR_COUNT += 1
-                write_log('[ERR]: A ConnectionError occurred. There\'s something wrong with somebody\'s network.')
-                err_log(link, 'ConnectionError', e)
-
-            elif requests.exceptions.TooManyRedirects in err_mro:  # Exceeded 30 redirects.
-                KNOWN_ERROR_COUNT += 1
-                write_log('[ERR]: A TooManyRedirects error occurred. Page is probably part of a redirect loop.')
-                err_log(link, 'TooManyRedirects', e)
-                BAD_LINKS.add(link)
-
-            elif requests.exceptions.ContentDecodingError in err_mro:
-                # Received response with content-encoding: gzip, but failed to decode it.
-                KNOWN_ERROR_COUNT += 1
-                write_log('[ERR]: A ContentDecodingError occurred. Probably just a zip bomb, nothing to worry about.')
-                err_log(link, 'ContentDecodingError', e)
-
-            elif OSError in err_mro:
+            if OSError in err_mro:
                 KNOWN_ERROR_COUNT += 1
                 write_log('[ERR]: An OSError occurred.')
                 err_log(link, 'OSError', e)
                 BAD_LINKS.add(link)
 
-            elif 'Unknown MIME type' in str(e):
-                NEW_MIME_COUNT += 1
-                write_log('[ERR]: Unknown MIME type: {0}'.format(str(e)[18:]))
-                err_log(link, 'Unknown MIME', e)
+            # HTTP Errors
+            # elif str(e) == 'HTTP Error 403: Forbidden':
+            #     write_log('[ERR]: HTTP 403: Access Forbidden.')
+            #     BAD_LINKS.add(link)
+
+            # elif str(e) == 'HTTP Error 429: Too Many Requests':
+            #     write_log('[ERR]: HTTP 429: Too Many Requests.')
+            #     TODO += TODO[0]  # Move link to end of TODO list
+
+            # elif etree.XMLSyntaxError in err_mro or etree.ParserError in err_mro:  # Error processing html/xml
+            #     KNOWN_ERROR_COUNT += 1
+            #     write_log('[ERR]: An XMLSyntaxError occurred. A web dev screwed up somewhere.')
+            #     err_log(link, 'XMLSyntaxError', e)
+
+            # elif requests.exceptions.SSLError in err_mro:  # Invalid SSL certificate
+            #     KNOWN_ERROR_COUNT += 1
+            #     write_log('[ERR]: An SSLError occurred. Site is using an invalid certificate.')
+            #     err_log(link, 'SSLError', e)
+            #     BAD_LINKS.add(link)
+
+            # elif requests.exceptions.ConnectionError in err_mro:  # Error connecting to page
+            #     KNOWN_ERROR_COUNT += 1
+            #     write_log('[ERR]: A ConnectionError occurred. There\'s something wrong with somebody\'s network.')
+            #     err_log(link, 'ConnectionError', e)
+
+            # elif requests.exceptions.TooManyRedirects in err_mro:  # Exceeded 30 redirects.
+            #     KNOWN_ERROR_COUNT += 1
+            #     write_log('[ERR]: A TooManyRedirects error occurred. Page is probably part of a redirect loop.')
+            #     err_log(link, 'TooManyRedirects', e)
+            #     BAD_LINKS.add(link)
+
+            # elif requests.exceptions.ContentDecodingError in err_mro:
+            #     # Received response with content-encoding: gzip, but failed to decode it.
+            #     KNOWN_ERROR_COUNT += 1
+            #     write_log('[ERR]: A ContentDecodingError occurred. Probably just a zip bomb, nothing to worry about.')
+            #     err_log(link, 'ContentDecodingError', e)
+
+            # elif 'Unknown MIME type' in str(e):
+            #     NEW_MIME_COUNT += 1
+            #     write_log('[ERR]: Unknown MIME type: {0}'.format(str(e)[18:]))
+            #     err_log(link, 'Unknown MIME', e)
 
             else:  # Any other error
                 NEW_ERROR_COUNT += 1
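
The surviving handler keys off type(e).mro(), the exception class's method resolution order, so a single check such as OSError in err_mro also matches OSError subclasses like FileNotFoundError and ConnectionError. A minimal sketch of the pattern outside the crawler:

    try:
        open('Z:\\no\\such\\path.txt')  # typically raises FileNotFoundError, an OSError subclass
    except Exception as e:
        err_mro = type(e).mro()  # e.g. [FileNotFoundError, OSError, Exception, BaseException, object]
        if OSError in err_mro:   # True for OSError and every subclass of it
            print('[ERR]: An OSError occurred.')
        else:
            print('[ERR]: Something else went wrong.')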

media/physics.dll.png

-2.46 KB
Binary file not shown.
