4
4
import datetime
5
5
from pytz import timezone
6
6
from itertools import compress
7
+ import asyncio
7
8
from slack import send_slack_message
8
- from menu_crawler import text_normalizer , VetRestaurantCrawler , GraduateDormRestaurantCrawler , SnucoRestaurantCrawler
9
+ from menu_crawler import text_normalizer , VetRestaurantCrawler , SnudormRestaurantCrawler , SnucoRestaurantCrawler
9
10
10
11
11
12
def compare_restaurants (db_restaurants , crawled_meals ):
@@ -22,8 +23,20 @@ def compare_restaurants(db_restaurants, crawled_meals):
22
23
return new_restaurants
23
24
24
25
26
def remove_duplicate(menus):
    """Return the menus with later duplicates dropped, keeping first occurrences.

    Two menus are duplicates when their 'restaurant_id', 'code', 'date' and
    'type' values are all equal; a key that is absent is treated as None.

    Args:
        menus: Iterable of dict-like menus that support ``.get``.

    Returns:
        A new list with the first menu seen for every distinct key, in the
        same relative order as the input.
    """
    unique_fields = ('restaurant_id', 'code', 'date', 'type')
    seen_keys = set()
    unique_menus = []
    for menu in menus:
        # NOTE(review): assumes the four key values are hashable (ids,
        # strings, dates); confirm against what the crawlers produce.
        key = tuple(menu.get(field) for field in unique_fields)
        if key not in seen_keys:
            # A set lookup replaces the scan over all earlier menus.
            seen_keys.add(key)
            unique_menus.append(menu)
    return unique_menus
35
+
36
+
25
37
def compare_menus (db_menus , crawled_meals , restaurants ):
26
- fields = ['restaurant_id' , 'code' , 'date' , 'type' , 'price' , 'etc' ]
38
+ unique_fields = ['restaurant_id' , 'code' , 'date' , 'type' ]
39
+ detail_fields = ['price' , 'etc' ]
27
40
restaurant_dict = {restaurant .get ('code' ): restaurant .get ('id' ) for restaurant in restaurants }
28
41
crawled_menus = [meal .as_dict () for meal in crawled_meals ]
29
42
for menu in crawled_menus :
@@ -33,14 +46,33 @@ def compare_menus(db_menus, crawled_meals, restaurants):
33
46
menu ['name_kr' ] = name
34
47
menu ['code' ] = text_normalizer (name , True )
35
48
49
+ crawled_menus = remove_duplicate (crawled_menus )
50
+
36
51
db_not_found = [True ] * len (db_menus )
37
52
crawled_not_found = [True ] * len (crawled_menus )
53
+ edited = [False ] * len (db_menus )
38
54
for db_idx in range (len (db_menus )):
39
55
for crawled_idx in range (len (crawled_menus )):
40
- if all ((db_menus [db_idx ].get (field , None ) == crawled_menus [crawled_idx ].get (field )) for field in fields ):
56
+ if all ((db_menus [db_idx ].get (field , None ) == crawled_menus [crawled_idx ].get (field )) for field in unique_fields ):
41
57
db_not_found [db_idx ] = False
42
58
crawled_not_found [crawled_idx ] = False
43
- return list (compress (crawled_menus , crawled_not_found )), list (compress (db_menus , db_not_found ))
59
+ for field in detail_fields :
60
+ if db_menus [db_idx ].get (field , None ) != crawled_menus [crawled_idx ].get (field ):
61
+ edited [db_idx ] = True
62
+ db_menus [db_idx ]['previous_' + field ] = db_menus [db_idx ].pop (field , None )
63
+ db_menus [db_idx ][field ] = crawled_menus [crawled_idx ].get (field )
64
+ break
65
+ return list (compress (crawled_menus , crawled_not_found )), list (compress (db_menus , db_not_found )), \
66
+ list (compress (db_menus , edited ))
67
+
68
+
69
def send_new_restaurants_message(new_restaurants):
    """Log the newly found restaurants and, if there are any, notify Slack.

    The Slack message lists every restaurant's 'name_kr' in double quotes.
    Nothing is sent when ``new_restaurants`` is empty.

    Args:
        new_restaurants: Sized collection of dict-like restaurants.

    Raises:
        TypeError: If a restaurant's 'name_kr' is not a string (for example
            missing, so ``.get`` returns None).
    """
    print(f"New restaurants: {repr(new_restaurants)} ")
    if not new_restaurants:
        return
    # Join the quoted names in one pass instead of growing a string per item.
    quoted_names = ''.join(
        '"' + restaurant.get('name_kr') + '" ' for restaurant in new_restaurants
    )
    send_slack_message(f"{len(new_restaurants)} new restaurants found: " + quoted_names)
44
76
45
77
46
78
def restaurants_transaction (crawled_meals , cursor ):
@@ -51,20 +83,39 @@ def restaurants_transaction(crawled_meals, cursor):
51
83
cursor .execute (get_restaurants_query )
52
84
db_restaurants = cursor .fetchall ()
53
85
new_restaurants = compare_restaurants (db_restaurants , crawled_meals )
54
- print (f"New Restaurants: { repr (new_restaurants )} " )
55
- if new_restaurants :
56
- slack_message = "New Restaurant(s) Found: "
57
- for restaurant in new_restaurants :
58
- slack_message = slack_message + '"' + restaurant .get ('name_kr' ) + '" '
59
- send_slack_message (slack_message )
60
- insert_restaurants_query = """
61
- INSERT INTO restaurant(code, name_kr)
62
- VALUES (%(code)s, %(name_kr)s);
63
- """
64
- cursor .executemany (insert_restaurants_query , new_restaurants )
86
+ send_new_restaurants_message (new_restaurants )
87
+ insert_restaurants_query = """
88
+ INSERT INTO restaurant(code, name_kr)
89
+ VALUES (%(code)s, %(name_kr)s);
90
+ """
91
+ cursor .executemany (insert_restaurants_query , new_restaurants )
65
92
print ("Restaurants checked" )
66
93
67
94
95
def send_deleted_menus_message(deleted_menus):
    """Log the menus about to be deleted and report them to Slack if any exist.

    Args:
        deleted_menus: Sized collection of menus; its ``repr`` is printed and,
            when the collection is non-empty, sent to Slack with a count.
    """
    print(f"Menus deleted: {deleted_menus!r} ")
    if not deleted_menus:
        # Nothing was deleted, so there is nothing to announce.
        return
    count = len(deleted_menus)
    send_slack_message(f"{count} menus deleted: {deleted_menus!r} ")
99
+
100
+
101
def send_new_menus_message(new_menus):
    """Send a Slack summary of the new menus, then log them.

    Every menu's 'name_kr' is listed in double quotes. Names containing ':'
    are additionally wrapped in asterisks (presumably Slack bold markup to
    flag names that need a manual check; confirm with the channel owners).
    The Slack message is sent even when ``new_menus`` is empty.

    Args:
        new_menus: Sized collection of dict-like menus.

    Raises:
        TypeError: If a menu's 'name_kr' is None, since the ':' membership
            test cannot run on it.
    """
    quoted_names = []
    for menu in new_menus:
        # Look the name up once and reuse it for the test and the message.
        name_kr = menu.get('name_kr')
        if ':' in name_kr:
            quoted_names.append('*"' + name_kr + '"* ')
        else:
            quoted_names.append('"' + name_kr + '" ')
    send_slack_message(f"{len(new_menus)} new menus found: " + ''.join(quoted_names))
    print(f"New menus found: {repr(new_menus)} ")
111
+
112
+
113
def send_edited_menus_message(edited_menus):
    """Log the edited menus and report them to Slack if any exist.

    Args:
        edited_menus: Sized collection of menus; its ``repr`` is printed and,
            when the collection is non-empty, sent to Slack with a count.
    """
    print(f"Menus edited: {edited_menus!r} ")
    if not edited_menus:
        # No edits were detected, so skip the Slack notification.
        return
    count = len(edited_menus)
    send_slack_message(f"{count} menus edited: {edited_menus!r} ")
117
+
118
+
68
119
def menus_transaction (crawled_meals , cursor ):
69
120
get_restaurants_query = """
70
121
SELECT id, code
@@ -74,53 +125,67 @@ def menus_transaction(crawled_meals, cursor):
74
125
restaurants = cursor .fetchall ()
75
126
today = datetime .datetime .now (timezone ('Asia/Seoul' )).date ()
76
127
get_menus_query = f"""
77
- SELECT id, restaurant_id, code, date, type, price, etc
128
+ SELECT id, restaurant_id, code, date, type, price, etc, name_kr
78
129
FROM menu
79
130
WHERE date>='{ today .isoformat ()} ';
80
131
"""
81
132
cursor .execute (get_menus_query )
82
133
db_menus = cursor .fetchall ()
83
134
84
- new_menus , deleted_menus = compare_menus (db_menus , crawled_meals , restaurants )
135
+ new_menus , deleted_menus , edited_menus = compare_menus (db_menus , crawled_meals , restaurants )
85
136
86
- print ( f"Deleted Menus: { repr ( deleted_menus ) } " )
137
+ send_deleted_menus_message ( deleted_menus )
87
138
if deleted_menus :
88
- send_slack_message (f"Deleted Menus: { repr (deleted_menus )} " )
89
139
deleted_menus_id = [str (menu .get ('id' )) for menu in deleted_menus ]
90
140
delete_menus_query = f"""
91
141
DELETE FROM menu
92
142
WHERE id in ({ ',' .join (deleted_menus_id )} );
93
143
"""
94
144
cursor .execute (delete_menus_query )
95
145
96
- print (f"New Menus: { repr (new_menus )} " )
97
- new_menus_to_check = list (filter (lambda menu : ':' in menu .get ('name_kr' ), new_menus ))
98
- if new_menus_to_check :
99
- send_slack_message (f"New Menus to be Checked: { repr (new_menus_to_check )} " )
146
+ send_new_menus_message (new_menus )
100
147
insert_menus_query = """
101
148
INSERT INTO menu(restaurant_id, code, date, type, name_kr, price, etc)
102
149
VALUES (%(restaurant_id)s, %(code)s, %(date)s, %(type)s, %(name_kr)s, %(price)s, %(etc)s);
103
150
"""
104
151
cursor .executemany (insert_menus_query , new_menus )
105
152
153
+ send_edited_menus_message (edited_menus )
154
+ edited_menus_query = """
155
+ UPDATE menu
156
+ SET price=%(price)s, etc=%(etc)s, name_kr=%(name_kr)s
157
+ WHERE id=%(id)s;
158
+ """
159
+ cursor .executemany (edited_menus_query , edited_menus )
160
+
106
161
print ("Menus checked" )
107
162
108
163
164
async def run_crawlers(crawlers):
    """Run every crawler's ``run_30days`` coroutine concurrently.

    Args:
        crawlers: Iterable of objects exposing an async ``run_30days`` method.

    Returns:
        A list with one entry per crawler, in input order: the coroutine's
        return value, or the exception it raised (exceptions are collected
        rather than propagated because of ``return_exceptions=True``).
    """
    pending = []
    for crawler in crawlers:
        # Scheduling each coroutine as a task lets them all make progress
        # while the others are waiting.
        pending.append(asyncio.create_task(crawler.run_30days()))
    outcomes = await asyncio.gather(*pending, return_exceptions=True)
    return outcomes
167
+
168
+
109
169
def crawl (event , context ):
170
+ siksha_db = pymysql .connect (
171
+ user = os .environ .get ('DB_USER' , 'root' ),
172
+ passwd = os .environ .get ('DB_PASSWORD' , 'waffle' ),
173
+ host = os .environ .get ('DB_HOST' , '127.0.0.1' ),
174
+ db = os .environ .get ('DB_NAME' , 'siksha' ),
175
+ charset = 'utf8'
176
+ )
177
+ cursor = siksha_db .cursor (pymysql .cursors .DictCursor )
110
178
try :
111
- print ("start crawling" )
112
- siksha_db = pymysql .connect (
113
- user = os .environ .get ('DB_USER' , 'root' ),
114
- passwd = os .environ .get ('DB_PASSWORD' , 'waffle' ),
115
- host = os .environ .get ('DB_HOST' , '127.0.0.1' ),
116
- db = os .environ .get ('DB_NAME' , 'siksha' ),
117
- charset = 'utf8'
118
- )
119
- cursor = siksha_db .cursor (pymysql .cursors .DictCursor )
120
-
121
- crawled_meals = VetRestaurantCrawler ().run_30days () \
122
- + GraduateDormRestaurantCrawler ().run_30days () \
123
- + SnucoRestaurantCrawler ().run_30days ()
179
+ print ("Start crawling" )
180
+ crawlers = [VetRestaurantCrawler (), SnudormRestaurantCrawler (), SnucoRestaurantCrawler ()]
181
+ results = asyncio .run (run_crawlers (crawlers ))
182
+ for result in results :
183
+ for err in result :
184
+ if err is not None :
185
+ raise err
186
+ crawled_meals = []
187
+ for crawler in crawlers :
188
+ crawled_meals = crawled_meals + crawler .meals
124
189
today = datetime .datetime .now (timezone ('Asia/Seoul' )).date ()
125
190
crawled_meals = list (filter (lambda meal : meal .date >= today , crawled_meals ))
126
191
restaurants_transaction (crawled_meals , cursor )
@@ -130,10 +195,15 @@ def crawl(event, context):
130
195
131
196
send_slack_message ("Crawling has been successfully done" )
132
197
return "Crawling has been successfully done"
133
- except :
198
+ except Exception as e :
134
199
siksha_db .rollback ()
135
- send_slack_message ("crawling has been failed" )
136
- return "crawling has been failed"
200
+ print (e )
201
+ send_slack_message ("Crawling has been failed" )
202
+ return "Crawling has been failed"
203
+ finally :
204
+ cursor .close ()
205
+ siksha_db .close ()
137
206
138
207
139
- #crawl(None, None)
208
+ if __name__ == "__main__" :
209
+ crawl (None , None )
0 commit comments