@@ -2,7 +2,7 @@
 spidy Web Crawler
 Built by rivermont and FalconWarriorr
 """
-VERSION = '1.2.0'
+VERSION = '1.3.0'
 
 ##########
 # IMPORT #
@@ -20,6 +20,7 @@ def get_time():
 def get_full_time():
     return t.strftime('%H:%M:%S, %A %b %Y')
 
+
 START_TIME = int(t.time())
 START_TIME_LONG = get_time()
 
@@ -34,7 +35,7 @@ def get_full_time():
 except OSError:
     pass  # Assumes only OSError will complain if logs/ already exists
 
-LOG_FILE = open('{0}\\logs\\spidy_log_{1}.txt'.format(CRAWLER_DIR, START_TIME), 'w+')
+LOG_FILE = open('{0}\\logs\\spidy_log_{1}.txt'.format(CRAWLER_DIR, START_TIME), 'w+', encoding='utf-8', errors='ignore')
 LOG_FILE_NAME = 'logs\\spidy_log_{0}'.format(START_TIME)
 
 
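The recurring change in this commit is that every `open()` call gains `encoding='utf-8', errors='ignore'`. A minimal sketch of what that buys (and costs), independent of spidy's code:

```python
# Minimal sketch of the pattern this commit applies to every open() call.
# errors='ignore' drops characters the codec cannot handle instead of
# raising UnicodeError, trading silent data loss for crash-free I/O.
text = 'caf\u00e9 \ud800ok'  # valid text plus a lone surrogate UTF-8 can't encode
with open('demo.txt', 'w', encoding='utf-8', errors='ignore') as f:
    f.write(text)  # the surrogate is silently dropped; no exception raised
with open('demo.txt', encoding='utf-8', errors='ignore') as f:
    print(f.read())  # -> 'café ok'
```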
@@ -47,6 +48,7 @@ def write_log(message):
     print(message)
     LOG_FILE.write('\n' + message)
 
+
 write_log('[INIT]: Starting spidy Web Crawler version {0}'.format(VERSION))
 write_log('[INIT]: Importing required libraries...')
 
@@ -131,7 +133,7 @@ def make_words(site):
     """
     Returns list of all valid words in page.
     """
-    page = str(site.content)  # Get page content
+    page = site.text  # Get page content
     word_list = page.split()  # Split content into lists of words, as separated by spaces
     del page
     word_list = list(set(word_list))  # Remove duplicates
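The `make_words` change fixes a real bug: `site.content` on a requests response is raw `bytes`, so `str(site.content)` produces the `b'...'` repr, escape sequences and all, rather than the page text. `response.text` is the body decoded with the encoding requests detects. A quick illustration, assuming `site` is a requests response as elsewhere in spidy:

```python
import requests

site = requests.get('https://example.com')
print(type(site.content))      # <class 'bytes'> - raw body
print(str(site.content)[:15])  # "b'<!doctype htm" - repr of bytes, not real text
print(site.text[:13])          # '<!doctype htm' - properly decoded str
```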
@@ -146,15 +148,15 @@ def save_files():
     Saves the TODO, done, word, and bad lists into their respective files.
     Also logs the action to the console.
     """
-    with open(TODO_FILE, 'w') as todoList:
+    with open(TODO_FILE, 'w', encoding='utf-8', errors='ignore') as todoList:
         for site in TODO:
             try:
                 todoList.write(site + '\n')  # Save TODO list
             except UnicodeError:
                 continue
     write_log('[LOG]: Saved TODO list to {0}'.format(TODO_FILE))
 
-    with open(DONE_FILE, 'w') as done_list:
+    with open(DONE_FILE, 'w', encoding='utf-8', errors='ignore') as done_list:
         for site in DONE:
             try:
                 done_list.write(site + '\n')  # Save done list
@@ -219,16 +221,16 @@ def save_page(url, page):
     file_path = '{0}\\saved\\{1}{2}'.format(CRAWLER_DIR, cropped_url, ext)
 
     # Save file
-    with open(file_path, 'wb+') as file:
-        file.write(bytes('''<!-- "{0}" -->
+    with open(file_path, 'w', encoding='utf-8', errors='ignore') as file:
+        file.write('''<!-- "{0}" -->
 <!-- Downloaded with the spidy Web Crawler -->
 <!-- https://github.com/rivermont/spidy -->
-'''.format(url), 'ascii'))
-        file.write(page.content)
+'''.format(url))
+        file.write(page.text)
 
 
 def update_file(file, content, file_type):
-    with open(file, 'r+') as open_file:  # Open save file for reading and writing
+    with open(file, 'r+', encoding='utf-8', errors='ignore') as open_file:  # Open save file for reading and writing
         file_content = open_file.readlines()  # Make list of all lines in file
         contents = []
         for x in file_content:
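For context, the old and new `save_page` write paths side by side, as a sketch (assuming `page` is a requests response): the old code mixed an ASCII-encoded header into a binary handle; the new code pushes everything through a single text-mode handle.

```python
# Old path: binary mode, header pre-encoded to ASCII bytes, raw body bytes.
with open('page.html', 'wb+') as f:
    f.write(bytes('<!-- header -->\n', 'ascii'))
    f.write(page.content)

# New path: one text-mode handle; header and decoded body are both str,
# and errors='ignore' drops anything UTF-8 cannot represent.
with open('page.html', 'w', encoding='utf-8', errors='ignore') as f:
    f.write('<!-- header -->\n')
    f.write(page.text)
```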
@@ -263,7 +265,7 @@ def log(message):
     Logs a single message to the error log file.
     Prints message verbatim, so message must be formatted correctly in the function call.
     """
-    with open(ERR_LOG_FILE, 'a') as open_file:
+    with open(ERR_LOG_FILE, 'a', encoding='utf-8', errors='ignore') as open_file:
         open_file.write('\n\n======LOG======')  # Write opening line
         open_file.write('\nTIME: {0}'.format(get_full_time()))  # Write current time
         open_file.write(message)  # Write message
@@ -293,7 +295,7 @@ def err_log(url, error1, error2):
     error2 is the extended text of the error.
     """
     time = t.strftime('%H:%M:%S, %A %b %Y')  # Get the current time
-    with open(ERR_LOG_FILE, 'a') as work_log:
+    with open(ERR_LOG_FILE, 'a', encoding='utf-8', errors='ignore') as work_log:
         work_log.write('\n\n=====ERROR=====')  # Write opening line
         work_log.write('\nTIME: {0}\nURL: {1}\nERROR: {2}\nEXT: {3}'.format(time, url, error1, str(error2)))
         work_log.write(LOG_END)  # Write closing line
@@ -539,7 +541,7 @@ def init():
         else:
             file_path = 'config\\{0}.cfg'.format(input_)
         write_log('[INFO]: Loading configuration settings from {0}'.format(file_path))
-        with open(file_path, 'r') as file:
+        with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
             for line in file.readlines():
                 exec(line, globals())
     except FileNotFoundError:
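Note that config files are executed line by line as Python source via `exec(line, globals())`, so each `.cfg` line is a plain assignment. A hypothetical example (the key name here is illustrative, not taken from spidy's actual config files):

```python
# Hypothetical .cfg line; spidy exec's each line into the module globals.
line = "OVERWRITE = False"
exec(line, globals())
print(OVERWRITE)  # -> False; the crawler can now read it as a global
```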
@@ -730,15 +732,15 @@ def init():
     write_log('[INIT]: Loading save files...')
     # Import saved TODO file data
     try:
-        with open(TODO_FILE, 'r') as f:
+        with open(TODO_FILE, 'r', encoding='utf-8', errors='ignore') as f:
             contents = f.readlines()
     except FileNotFoundError:  # If no TODO file is present
         contents = []
     for line in contents:
         TODO.append(line.strip())
     # Import saved done file data
     try:
-        with open(DONE_FILE, 'r') as f:
+        with open(DONE_FILE, 'r', encoding='utf-8', errors='ignore') as f:
             contents = f.readlines()
     except FileNotFoundError:  # If no DONE file is present
         contents = []
@@ -775,10 +777,10 @@ def main():
         pass  # Assumes only OSError will complain if saved/ already exists
 
     # Create required files
-    with open(WORD_FILE, 'w'):
+    with open(WORD_FILE, 'w', encoding='utf-8', errors='ignore'):
         pass
 
-    with open(BAD_FILE, 'w'):
+    with open(BAD_FILE, 'w', encoding='utf-8', errors='ignore'):
         pass
 
     write_log('[INIT]: Successfully started spidy Web Crawler version {0}...'.format(VERSION))
@@ -817,7 +819,7 @@ def main():
             word_list = make_words(page)  # Get all words from page
             WORDS.update(word_list)  # Add words to word list
         try:
-            links = [link for element, attribute, link, pos in html.iterlinks(page.content)]
+            links = [link for element, attribute, link, pos in html.iterlinks(page.text)]
         except (etree.XMLSyntaxError, etree.ParserError):
             links = []
         links = list(set(links))  # Remove duplicates and shuffle links
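`lxml.html.iterlinks` accepts either markup or a parsed document and yields `(element, attribute, link, pos)` 4-tuples for everything link-like it finds; the crawler keeps only the third field. A small self-contained example:

```python
from lxml import html

doc = ('<html><body><a href="https://example.com/a">a</a>'
       '<img src="https://example.com/b.png"/></body></html>')
# Each yielded tuple is (element, attribute, link, pos); keep only the links.
links = [link for element, attribute, link, pos in html.iterlinks(doc)]
print(links)  # ['https://example.com/a', 'https://example.com/b.png']
```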
@@ -844,58 +846,53 @@ def main():
             write_log('[INFO]: An error was raised trying to process {0}'.format(link))
             err_mro = type(e).mro()
 
-            # HTTP Errors
-            if str(e) == 'HTTP Error 403: Forbidden':
-                write_log('[ERR]: HTTP 403: Access Forbidden.')
-                BAD_LINKS.add(link)
-
-            elif str(e) == 'HTTP Error 429: Too Many Requests':
-                write_log('[ERR]: HTTP 429: Too Many Requests.')
-                TODO += TODO[0]  # Move link to end of TODO list
-
-            elif etree.XMLSyntaxError in err_mro or etree.ParserError in err_mro:  # Error processing html/xml
-                KNOWN_ERROR_COUNT += 1
-                write_log('[ERR]: An XMLSyntaxError occurred. A web dev screwed up somewhere.')
-                err_log(link, 'XMLSyntaxError', e)
-
-            elif UnicodeError in err_mro:  # Error trying to convert foreign characters to Unicode
-                KNOWN_ERROR_COUNT += 1
-                write_log('[ERR]: A UnicodeError occurred. URL had a foreign character or something.')
-                err_log(link, 'UnicodeError', e)
-
-            elif requests.exceptions.SSLError in err_mro:  # Invalid SSL certificate
-                KNOWN_ERROR_COUNT += 1
-                write_log('[ERR]: An SSLError occurred. Site is using an invalid certificate.')
-                err_log(link, 'SSLError', e)
-                BAD_LINKS.add(link)
-
-            elif requests.exceptions.ConnectionError in err_mro:  # Error connecting to page
-                KNOWN_ERROR_COUNT += 1
-                write_log('[ERR]: A ConnectionError occurred. There\'s something wrong with somebody\'s network.')
-                err_log(link, 'ConnectionError', e)
-
-            elif requests.exceptions.TooManyRedirects in err_mro:  # Exceeded 30 redirects.
-                KNOWN_ERROR_COUNT += 1
-                write_log('[ERR]: A TooManyRedirects error occurred. Page is probably part of a redirect loop.')
-                err_log(link, 'TooManyRedirects', e)
-                BAD_LINKS.add(link)
-
-            elif requests.exceptions.ContentDecodingError in err_mro:
-                # Received response with content-encoding: gzip, but failed to decode it.
-                KNOWN_ERROR_COUNT += 1
-                write_log('[ERR]: A ContentDecodingError occurred. Probably just a zip bomb, nothing to worry about.')
-                err_log(link, 'ContentDecodingError', e)
-
-            elif OSError in err_mro:
+            if OSError in err_mro:
                 KNOWN_ERROR_COUNT += 1
                 write_log('[ERR]: An OSError occurred.')
                 err_log(link, 'OSError', e)
                 BAD_LINKS.add(link)
 
-            elif 'Unknown MIME type' in str(e):
-                NEW_MIME_COUNT += 1
-                write_log('[ERR]: Unknown MIME type: {0}'.format(str(e)[18:]))
-                err_log(link, 'Unknown MIME', e)
+            # HTTP Errors
+            # elif str(e) == 'HTTP Error 403: Forbidden':
+            #     write_log('[ERR]: HTTP 403: Access Forbidden.')
+            #     BAD_LINKS.add(link)
+
+            # elif str(e) == 'HTTP Error 429: Too Many Requests':
+            #     write_log('[ERR]: HTTP 429: Too Many Requests.')
+            #     TODO += TODO[0]  # Move link to end of TODO list
+
+            # elif etree.XMLSyntaxError in err_mro or etree.ParserError in err_mro:  # Error processing html/xml
+            #     KNOWN_ERROR_COUNT += 1
+            #     write_log('[ERR]: An XMLSyntaxError occurred. A web dev screwed up somewhere.')
+            #     err_log(link, 'XMLSyntaxError', e)
+
+            # elif requests.exceptions.SSLError in err_mro:  # Invalid SSL certificate
+            #     KNOWN_ERROR_COUNT += 1
+            #     write_log('[ERR]: An SSLError occurred. Site is using an invalid certificate.')
+            #     err_log(link, 'SSLError', e)
+            #     BAD_LINKS.add(link)
+
+            # elif requests.exceptions.ConnectionError in err_mro:  # Error connecting to page
+            #     KNOWN_ERROR_COUNT += 1
+            #     write_log('[ERR]: A ConnectionError occurred. There\'s something wrong with somebody\'s network.')
+            #     err_log(link, 'ConnectionError', e)
+
+            # elif requests.exceptions.TooManyRedirects in err_mro:  # Exceeded 30 redirects.
+            #     KNOWN_ERROR_COUNT += 1
+            #     write_log('[ERR]: A TooManyRedirects error occurred. Page is probably part of a redirect loop.')
+            #     err_log(link, 'TooManyRedirects', e)
+            #     BAD_LINKS.add(link)
+
+            # elif requests.exceptions.ContentDecodingError in err_mro:
+            #     # Received response with content-encoding: gzip, but failed to decode it.
+            #     KNOWN_ERROR_COUNT += 1
+            #     write_log('[ERR]: A ContentDecodingError occurred. Probably just a zip bomb, nothing to worry about.')
+            #     err_log(link, 'ContentDecodingError', e)
+
+            # elif 'Unknown MIME type' in str(e):
+            #     NEW_MIME_COUNT += 1
+            #     write_log('[ERR]: Unknown MIME type: {0}'.format(str(e)[18:]))
+            #     err_log(link, 'Unknown MIME', e)
 
             else:  # Any other error
                 NEW_ERROR_COUNT += 1
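Why the consolidation works: `type(e).mro()` returns the exception's class followed by all of its ancestors, so `OSError in err_mro` matches any `OSError` subclass. In requests 2.x, `RequestException` inherits from `IOError`, which Python 3 aliases to `OSError`, so the SSL, connection, redirect, and decoding errors that used to have dedicated branches still land in the single `OSError` branch. A quick check of that assumption:

```python
import requests

# mro() lists a class and all of its ancestors; the crawler tests membership
# in this list instead of chaining isinstance() checks on the instance.
print(OSError in requests.exceptions.SSLError.mro())              # True
print(OSError in requests.exceptions.TooManyRedirects.mro())      # True
print(OSError in requests.exceptions.ContentDecodingError.mro())  # True
```

Note that `UnicodeError` is not an `OSError` subclass; its branch was dropped outright rather than commented out, presumably because the new `errors='ignore'` file handling makes it largely moot.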