Skip to content

Commit 40d6309

Browse files
authored
Merge pull request #10 from wafflestudio/develop
Develop
2 parents 365939e + 3125cf5 commit 40d6309

File tree

4 files changed

+250
-127
lines changed

4 files changed

+250
-127
lines changed

handler.py

Lines changed: 110 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,9 @@
44
import datetime
55
from pytz import timezone
66
from itertools import compress
7+
import asyncio
78
from slack import send_slack_message
8-
from menu_crawler import text_normalizer, VetRestaurantCrawler, GraduateDormRestaurantCrawler, SnucoRestaurantCrawler
9+
from menu_crawler import text_normalizer, VetRestaurantCrawler, SnudormRestaurantCrawler, SnucoRestaurantCrawler
910

1011

1112
def compare_restaurants(db_restaurants, crawled_meals):
@@ -22,8 +23,20 @@ def compare_restaurants(db_restaurants, crawled_meals):
2223
return new_restaurants
2324

2425

26+
def remove_duplicate(menus):
    """Return ``menus`` without entries that repeat an earlier menu's identity.

    Two menus are considered duplicates when they agree on every one of
    ``restaurant_id``, ``code``, ``date`` and ``type``; a missing key is
    treated as ``None``.  The first occurrence wins, the original order is
    preserved, and the input list itself is not modified.

    The identity values are used as set members, so they must be hashable.

    Args:
        menus: list of dict-like menus (anything exposing ``.get``).

    Returns:
        A new list holding the first menu seen for each distinct identity.
    """
    unique_fields = ('restaurant_id', 'code', 'date', 'type')
    seen_keys = set()
    unique_menus = []
    for menu in menus:
        # One tuple per menu lets a set lookup replace a scan of all
        # previously visited menus.
        key = tuple(menu.get(field) for field in unique_fields)
        if key in seen_keys:
            continue
        seen_keys.add(key)
        unique_menus.append(menu)
    return unique_menus
35+
36+
2537
def compare_menus(db_menus, crawled_meals, restaurants):
26-
fields = ['restaurant_id', 'code', 'date', 'type', 'price', 'etc']
38+
unique_fields = ['restaurant_id', 'code', 'date', 'type']
39+
detail_fields = ['price', 'etc']
2740
restaurant_dict = {restaurant.get('code'): restaurant.get('id') for restaurant in restaurants}
2841
crawled_menus = [meal.as_dict() for meal in crawled_meals]
2942
for menu in crawled_menus:
@@ -33,14 +46,33 @@ def compare_menus(db_menus, crawled_meals, restaurants):
3346
menu['name_kr'] = name
3447
menu['code'] = text_normalizer(name, True)
3548

49+
crawled_menus = remove_duplicate(crawled_menus)
50+
3651
db_not_found = [True] * len(db_menus)
3752
crawled_not_found = [True] * len(crawled_menus)
53+
edited = [False] * len(db_menus)
3854
for db_idx in range(len(db_menus)):
3955
for crawled_idx in range(len(crawled_menus)):
40-
if all((db_menus[db_idx].get(field, None) == crawled_menus[crawled_idx].get(field)) for field in fields):
56+
if all((db_menus[db_idx].get(field, None) == crawled_menus[crawled_idx].get(field)) for field in unique_fields):
4157
db_not_found[db_idx] = False
4258
crawled_not_found[crawled_idx] = False
43-
return list(compress(crawled_menus, crawled_not_found)), list(compress(db_menus, db_not_found))
59+
for field in detail_fields:
60+
if db_menus[db_idx].get(field, None) != crawled_menus[crawled_idx].get(field):
61+
edited[db_idx] = True
62+
db_menus[db_idx]['previous_' + field] = db_menus[db_idx].pop(field, None)
63+
db_menus[db_idx][field] = crawled_menus[crawled_idx].get(field)
64+
break
65+
return list(compress(crawled_menus, crawled_not_found)), list(compress(db_menus, db_not_found)), \
66+
list(compress(db_menus, edited))
67+
68+
69+
def send_new_restaurants_message(new_restaurants):
    """Log newly found restaurants and, if there are any, report them to Slack.

    The list is always printed.  A Slack message is sent only when the list is
    non-empty; it contains the count followed by every restaurant's
    ``name_kr`` wrapped in double quotes, each followed by a single space.

    Args:
        new_restaurants: list of dict-like restaurants exposing ``name_kr``.
            A missing or ``None`` name is rendered as an empty quoted string
            instead of aborting the notification.
    """
    print(f"New restaurants: {repr(new_restaurants)}")
    if not new_restaurants:
        return
    quoted_names = []
    for restaurant in new_restaurants:
        name_kr = restaurant.get('name_kr') or ''
        quoted_names.append(f'"{name_kr}" ')
    # Build the text in one join rather than growing a string in the loop.
    slack_message = f"{len(new_restaurants)} new restaurants found: " + ''.join(quoted_names)
    send_slack_message(slack_message)
4476

4577

4678
def restaurants_transaction(crawled_meals, cursor):
@@ -51,20 +83,39 @@ def restaurants_transaction(crawled_meals, cursor):
5183
cursor.execute(get_restaurants_query)
5284
db_restaurants = cursor.fetchall()
5385
new_restaurants = compare_restaurants(db_restaurants, crawled_meals)
54-
print(f"New Restaurants: {repr(new_restaurants)}")
55-
if new_restaurants:
56-
slack_message = "New Restaurant(s) Found: "
57-
for restaurant in new_restaurants:
58-
slack_message = slack_message + '"' + restaurant.get('name_kr') + '" '
59-
send_slack_message(slack_message)
60-
insert_restaurants_query = """
61-
INSERT INTO restaurant(code, name_kr)
62-
VALUES (%(code)s, %(name_kr)s);
63-
"""
64-
cursor.executemany(insert_restaurants_query, new_restaurants)
86+
send_new_restaurants_message(new_restaurants)
87+
insert_restaurants_query = """
88+
INSERT INTO restaurant(code, name_kr)
89+
VALUES (%(code)s, %(name_kr)s);
90+
"""
91+
cursor.executemany(insert_restaurants_query, new_restaurants)
6592
print("Restaurants checked")
6693

6794

95+
def send_deleted_menus_message(deleted_menus):
    """Log the menus that were deleted and forward them to Slack when non-empty.

    Args:
        deleted_menus: list of menus removed from the database; its ``repr``
            is printed unconditionally and sent to Slack, prefixed with the
            count, only if the list has at least one entry.
    """
    listing = repr(deleted_menus)
    print(f"Menus deleted: {listing}")
    if not deleted_menus:
        return
    send_slack_message(f"{len(deleted_menus)} menus deleted: {listing}")
99+
100+
101+
def send_new_menus_message(new_menus):
    """Log newly found menus and, if there are any, report them to Slack.

    The list is always printed first, so the log entry exists even if the
    Slack call fails.  A Slack message is sent only when the list is
    non-empty, matching the other ``send_*_message`` helpers in this file; an
    empty crawl no longer produces a "0 new menus found" notification.

    Each menu's ``name_kr`` is wrapped in double quotes and followed by a
    space.  Names containing ``':'`` are additionally wrapped in asterisks
    (presumably Slack bold markup, to make them stand out for review).

    Args:
        new_menus: list of dict-like menus exposing ``name_kr``.  A missing or
            ``None`` name is rendered as an empty quoted string instead of
            raising.
    """
    print(f"New menus found: {repr(new_menus)}")
    if not new_menus:
        return
    quoted_names = []
    for menu in new_menus:
        name_kr = menu.get('name_kr') or ''
        if ':' in name_kr:
            quoted_names.append(f'*"{name_kr}"* ')
        else:
            quoted_names.append(f'"{name_kr}" ')
    # Build the text in one join rather than growing a string in the loop.
    slack_message = f"{len(new_menus)} new menus found: " + ''.join(quoted_names)
    send_slack_message(slack_message)
111+
112+
113+
def send_edited_menus_message(edited_menus):
    """Log the menus that were edited and forward them to Slack when non-empty.

    Args:
        edited_menus: list of menus whose details changed; its ``repr`` is
            printed unconditionally and sent to Slack, prefixed with the
            count, only if the list has at least one entry.
    """
    listing = repr(edited_menus)
    print(f"Menus edited: {listing}")
    if not edited_menus:
        return
    send_slack_message(f"{len(edited_menus)} menus edited: {listing}")
117+
118+
68119
def menus_transaction(crawled_meals, cursor):
69120
get_restaurants_query = """
70121
SELECT id, code
@@ -74,53 +125,67 @@ def menus_transaction(crawled_meals, cursor):
74125
restaurants = cursor.fetchall()
75126
today = datetime.datetime.now(timezone('Asia/Seoul')).date()
76127
get_menus_query = f"""
77-
SELECT id, restaurant_id, code, date, type, price, etc
128+
SELECT id, restaurant_id, code, date, type, price, etc, name_kr
78129
FROM menu
79130
WHERE date>='{today.isoformat()}';
80131
"""
81132
cursor.execute(get_menus_query)
82133
db_menus = cursor.fetchall()
83134

84-
new_menus, deleted_menus = compare_menus(db_menus, crawled_meals, restaurants)
135+
new_menus, deleted_menus, edited_menus = compare_menus(db_menus, crawled_meals, restaurants)
85136

86-
print(f"Deleted Menus: {repr(deleted_menus)}")
137+
send_deleted_menus_message(deleted_menus)
87138
if deleted_menus:
88-
send_slack_message(f"Deleted Menus: {repr(deleted_menus)}")
89139
deleted_menus_id = [str(menu.get('id')) for menu in deleted_menus]
90140
delete_menus_query = f"""
91141
DELETE FROM menu
92142
WHERE id in ({','.join(deleted_menus_id)});
93143
"""
94144
cursor.execute(delete_menus_query)
95145

96-
print(f"New Menus: {repr(new_menus)}")
97-
new_menus_to_check = list(filter(lambda menu: ':' in menu.get('name_kr'), new_menus))
98-
if new_menus_to_check:
99-
send_slack_message(f"New Menus to be Checked: {repr(new_menus_to_check)}")
146+
send_new_menus_message(new_menus)
100147
insert_menus_query = """
101148
INSERT INTO menu(restaurant_id, code, date, type, name_kr, price, etc)
102149
VALUES (%(restaurant_id)s, %(code)s, %(date)s, %(type)s, %(name_kr)s, %(price)s, %(etc)s);
103150
"""
104151
cursor.executemany(insert_menus_query, new_menus)
105152

153+
send_edited_menus_message(edited_menus)
154+
edited_menus_query = """
155+
UPDATE menu
156+
SET price=%(price)s, etc=%(etc)s, name_kr=%(name_kr)s
157+
WHERE id=%(id)s;
158+
"""
159+
cursor.executemany(edited_menus_query, edited_menus)
160+
106161
print("Menus checked")
107162

108163

164+
async def run_crawlers(crawlers):
    """Run every crawler's ``run_30days`` coroutine concurrently.

    Args:
        crawlers: iterable of objects exposing an async ``run_30days()``.

    Returns:
        A list with one entry per crawler, in input order: the value returned
        by its ``run_30days`` call, or the exception instance it raised
        (exceptions are collected rather than propagated).
    """
    pending = []
    for crawler in crawlers:
        # Schedule each crawl as its own task before awaiting any of them.
        pending.append(asyncio.create_task(crawler.run_30days()))
    outcomes = await asyncio.gather(*pending, return_exceptions=True)
    return outcomes
167+
168+
109169
def crawl(event, context):
170+
siksha_db = pymysql.connect(
171+
user=os.environ.get('DB_USER', 'root'),
172+
passwd=os.environ.get('DB_PASSWORD', 'waffle'),
173+
host=os.environ.get('DB_HOST', '127.0.0.1'),
174+
db=os.environ.get('DB_NAME', 'siksha'),
175+
charset='utf8'
176+
)
177+
cursor = siksha_db.cursor(pymysql.cursors.DictCursor)
110178
try:
111-
print("start crawling")
112-
siksha_db = pymysql.connect(
113-
user=os.environ.get('DB_USER', 'root'),
114-
passwd=os.environ.get('DB_PASSWORD', 'waffle'),
115-
host=os.environ.get('DB_HOST', '127.0.0.1'),
116-
db=os.environ.get('DB_NAME', 'siksha'),
117-
charset='utf8'
118-
)
119-
cursor = siksha_db.cursor(pymysql.cursors.DictCursor)
120-
121-
crawled_meals = VetRestaurantCrawler().run_30days() \
122-
+ GraduateDormRestaurantCrawler().run_30days() \
123-
+ SnucoRestaurantCrawler().run_30days()
179+
print("Start crawling")
180+
crawlers = [VetRestaurantCrawler(), SnudormRestaurantCrawler(), SnucoRestaurantCrawler()]
181+
results = asyncio.run(run_crawlers(crawlers))
182+
for result in results:
183+
for err in result:
184+
if err is not None:
185+
raise err
186+
crawled_meals = []
187+
for crawler in crawlers:
188+
crawled_meals = crawled_meals + crawler.meals
124189
today = datetime.datetime.now(timezone('Asia/Seoul')).date()
125190
crawled_meals = list(filter(lambda meal: meal.date >= today, crawled_meals))
126191
restaurants_transaction(crawled_meals, cursor)
@@ -130,10 +195,15 @@ def crawl(event, context):
130195

131196
send_slack_message("Crawling has been successfully done")
132197
return "Crawling has been successfully done"
133-
except:
198+
except Exception as e:
134199
siksha_db.rollback()
135-
send_slack_message("crawling has been failed")
136-
return "crawling has been failed"
200+
print(e)
201+
send_slack_message("Crawling has been failed")
202+
return "Crawling has been failed"
203+
finally:
204+
cursor.close()
205+
siksha_db.close()
137206

138207

139-
#crawl(None, None)
208+
if __name__ == "__main__":
209+
crawl(None, None)

0 commit comments

Comments
 (0)