1
1
import json
2
2
import pymysql
3
3
import os
4
- import random
4
+ import datetime
5
+ from pytz import timezone
6
+ from itertools import compress
5
7
from slack import send_slack_message
6
- from menu_crawler import VetRestaurantCrawler , GraduateDormRestaurantCrawler , SnucoRestaurantCrawler
8
+ from menu_crawler import text_normalizer , VetRestaurantCrawler , GraduateDormRestaurantCrawler , SnucoRestaurantCrawler
9
+
10
+
11
def compare_restaurants(db_restaurants, crawled_meals):
    """Return the restaurants that were crawled but are not in the database yet.

    Args:
        db_restaurants: Iterable of mappings; only ``.get('code')`` is read.
        crawled_meals: Iterable of objects exposing a ``restaurant`` attribute.
            The attribute value is used both as the display name and, after
            ``text_normalizer(..., True)``, as the restaurant code.

    Returns:
        A list of ``{'code': ..., 'name_kr': ...}`` dicts, one per code that was
        not already known, in the order the codes were first encountered.
    """
    # A set gives O(1) membership tests; codes are assumed to be hashable
    # (strings or None) -- TODO confirm against text_normalizer.
    known_codes = {restaurant.get('code') for restaurant in db_restaurants}
    new_restaurants = []
    for meal in crawled_meals:
        code = text_normalizer(meal.restaurant, True)
        if code in known_codes:
            continue
        new_restaurants.append(dict(
            code=code,
            name_kr=meal.restaurant,
        ))
        # Remember the code so that later meals of the same restaurant do not
        # produce a duplicate entry.
        known_codes.add(code)
    return new_restaurants
23
+
24
+
25
def compare_menus(db_menus, crawled_meals, restaurants):
    """Diff the crawled menus against the menus currently stored in the database.

    Two menus are considered the same when they agree on every field in
    ``restaurant_id``, ``code``, ``date``, ``type``, ``price`` and ``etc``.

    Args:
        db_menus: Sequence of mappings describing stored menus.
        crawled_meals: Iterable of objects exposing ``as_dict()``; the resulting
            dict must contain the keys ``restaurant`` and ``name``.
        restaurants: Iterable of mappings read via ``.get('code')`` and
            ``.get('id')``; used to translate restaurant codes into ids.

    Returns:
        A ``(new_menus, deleted_menus)`` tuple: crawled menus with no matching
        stored menu, and stored menus with no matching crawled menu.
    """
    fields = ['restaurant_id', 'code', 'date', 'type', 'price', 'etc']
    restaurant_ids = {restaurant.get('code'): restaurant.get('id') for restaurant in restaurants}

    crawled_menus = [meal.as_dict() for meal in crawled_meals]
    for menu in crawled_menus:
        # Replace the restaurant name by the id of the matching restaurant row
        # (None when the normalised code is unknown).
        restaurant_code = text_normalizer(menu.pop('restaurant'), True)
        menu['restaurant_id'] = restaurant_ids.get(restaurant_code)
        # Rename 'name' to 'name_kr' and derive the menu code from it.
        name = menu.pop('name')
        menu['name_kr'] = name
        menu['code'] = text_normalizer(name, True)

    db_not_found = [True] * len(db_menus)
    crawled_not_found = [True] * len(crawled_menus)
    # Every pair is checked (no early exit) so that duplicates on either side
    # are all flagged as found.
    for db_idx, db_menu in enumerate(db_menus):
        for crawled_idx, crawled_menu in enumerate(crawled_menus):
            if all(db_menu.get(field, None) == crawled_menu.get(field) for field in fields):
                db_not_found[db_idx] = False
                crawled_not_found[crawled_idx] = False

    new_menus = list(compress(crawled_menus, crawled_not_found))
    deleted_menus = list(compress(db_menus, db_not_found))
    return new_menus, deleted_menus
44
+
45
+
46
def restaurants_transaction(crawled_meals, cursor):
    """Insert restaurants that appear in the crawl but not in the database.

    Reads all restaurant codes through ``cursor``, determines the unseen ones
    with ``compare_restaurants``, reports them on Slack and inserts them.
    Nothing is committed here; that is left to the caller.

    Args:
        crawled_meals: Crawled meal objects, forwarded to ``compare_restaurants``.
        cursor: Database cursor whose ``fetchall()`` rows support ``.get('code')``.
    """
    get_restaurants_query = """
        SELECT code
        FROM restaurant;
    """
    cursor.execute(get_restaurants_query)
    db_restaurants = cursor.fetchall()

    new_restaurants = compare_restaurants(db_restaurants, crawled_meals)
    print(f"New Restaurants: {repr(new_restaurants)}")
    if new_restaurants:
        # Each name is wrapped in double quotes and followed by one space.
        quoted_names = ''.join(
            '"' + restaurant.get('name_kr') + '" ' for restaurant in new_restaurants
        )
        send_slack_message("New Restaurant(s) Found: " + quoted_names)
        insert_restaurants_query = """
            INSERT INTO restaurant(code, name_kr)
            VALUES (%(code)s, %(name_kr)s);
        """
        cursor.executemany(insert_restaurants_query, new_restaurants)
    print("Restaurants checked")
66
+
67
+
68
def menus_transaction(crawled_meals, cursor):
    """Synchronise the ``menu`` table with the crawled meals from today onwards.

    Stored menus dated today (Asia/Seoul) or later that no longer appear in the
    crawl are deleted; crawled menus that are not stored yet are inserted.
    Deletions, and new menus whose name contains ``':'``, are reported on Slack.
    Nothing is committed here; that is left to the caller.

    Args:
        crawled_meals: Crawled meal objects, forwarded to ``compare_menus``.
        cursor: Database cursor returning mapping rows from ``fetchall()``.
    """
    get_restaurants_query = """
        SELECT id, code
        FROM restaurant;
    """
    cursor.execute(get_restaurants_query)
    restaurants = cursor.fetchall()

    today = datetime.datetime.now(timezone('Asia/Seoul')).date()
    # Values are bound by the driver instead of being spliced into the SQL text.
    get_menus_query = """
        SELECT id, restaurant_id, code, date, type, price, etc
        FROM menu
        WHERE date >= %s;
    """
    cursor.execute(get_menus_query, (today.isoformat(),))
    db_menus = cursor.fetchall()

    new_menus, deleted_menus = compare_menus(db_menus, crawled_meals, restaurants)

    print(f"Deleted Menus: {repr(deleted_menus)}")
    if deleted_menus:
        send_slack_message(f"Deleted Menus: {repr(deleted_menus)}")
        deleted_menus_id = [menu.get('id') for menu in deleted_menus]
        # One placeholder per id; only the placeholder list is formatted into
        # the statement, the ids themselves are passed as parameters.
        placeholders = ', '.join(['%s'] * len(deleted_menus_id))
        delete_menus_query = f"""
            DELETE FROM menu
            WHERE id IN ({placeholders});
        """
        cursor.execute(delete_menus_query, deleted_menus_id)

    print(f"New Menus: {repr(new_menus)}")
    # Names containing ':' are flagged for manual review; they are still
    # inserted together with all the other new menus below.
    new_menus_to_check = [menu for menu in new_menus if ':' in menu.get('name_kr')]
    if new_menus_to_check:
        send_slack_message(f"New Menus to be Checked: {repr(new_menus_to_check)}")
    insert_menus_query = """
        INSERT INTO menu(restaurant_id, code, date, type, name_kr, price, etc)
        VALUES (%(restaurant_id)s, %(code)s, %(date)s, %(type)s, %(name_kr)s, %(price)s, %(etc)s);
    """
    cursor.executemany(insert_menus_query, new_menus)

    print("Menus checked")
107
+
7
108
8
109
def crawl (event , context ):
9
110
try :
111
+ print ("start crawling" )
10
112
siksha_db = pymysql .connect (
11
113
user = os .environ .get ('DB_USER' , 'root' ),
12
114
passwd = os .environ .get ('DB_PASSWORD' , 'waffle' ),
@@ -15,35 +117,23 @@ def crawl(event, context):
15
117
charset = 'utf8'
16
118
)
17
119
cursor = siksha_db .cursor (pymysql .cursors .DictCursor )
18
- # TRANSACTION START
19
- get_restaurants_query = """
20
- SELECT *
21
- FROM restaurant
22
- """
23
- cursor .execute (get_restaurants_query )
24
- restaurants = cursor .fetchall ()
25
- print ('log using stdout' )
26
- print (f'get restaurants result: { repr (restaurants )} ' )
27
- insert_restaurants_query = """
28
- INSERT INTO restaurant(code, name_kr, name_en, addr, lat, lng)
29
- VALUES (%(code)s, %(name_kr)s, %(name_en)s, %(addr)s, %(lat)s, %(lng)s);
30
- """
31
- new_restaurants = [
32
- dict (
33
- code = f"test{ random .random ()} " ,
34
- name_kr = "한글명" ,
35
- name_en = "영어명" ,
36
- addr = "한글주소" ,
37
- lat = 0 ,
38
- lng = 0
39
- ) for i in range (10 )
40
- ]
41
- cursor .executemany (insert_restaurants_query , new_restaurants )
42
- # TRANSACTION END
120
+
121
+ crawled_meals = VetRestaurantCrawler ().run_30days () \
122
+ + GraduateDormRestaurantCrawler ().run_30days () \
123
+ + SnucoRestaurantCrawler ().run_30days ()
124
+ today = datetime .datetime .now (timezone ('Asia/Seoul' )).date ()
125
+ crawled_meals = list (filter (lambda meal : meal .date >= today , crawled_meals ))
126
+ restaurants_transaction (crawled_meals , cursor )
127
+ siksha_db .commit ()
128
+ menus_transaction (crawled_meals , cursor )
43
129
siksha_db .commit ()
44
- send_slack_message ("crawling has been successfully done" )
45
- return "crawling has been successfully done"
130
+
131
+ send_slack_message ("Crawling has been successfully done" )
132
+ return "Crawling has been successfully done"
46
133
except :
47
134
siksha_db .rollback ()
48
135
send_slack_message ("crawling has been failed" )
49
136
return "crawling has been failed"
137
+
138
+
139
+ #crawl(None, None)
0 commit comments