Skip to content

Commit 528ae7a

Browse files
author
naibo
committed
JSON Format Support
1 parent 7c5f9a4 commit 528ae7a

File tree

8 files changed

+65
-25
lines changed

8 files changed

+65
-25
lines changed

ElectronJS/src/taskGrid/FlowChart.html

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -639,6 +639,7 @@ <h4 class="modal-title" id="myModalLabel">Save Task (Can press Ctrl + S to open
639639
<option value = "xlsx">XLSX (EXCEL, note that a single Excel cell can save up to 32767 characters)</option>
640640
<option value = "csv">CSV</option>
641641
<option value = "txt">TXT</option>
642+
<option value = "json">JSON</option>
642643
<option value = "mysql">MySQL Database</option>
643644
</select>
644645
<label>Export File Name/Database Table Name (Can use ../ to represent relative path to change the file save location,the keyword "current_time" will be replaced with the timestamp when the task is executed):</label>

ElectronJS/src/taskGrid/FlowChart_CN.html

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -640,6 +640,7 @@ <h4 class="modal-title" id="myModalLabel">保存任务(可按Ctrl+S调出此
640640
<option value = "xlsx">XLSX(即EXCEL文件,注意Excel单个单元格最多可存储32767字符)</option>
641641
<option value = "csv">CSV</option>
642642
<option value = "txt">TXT</option>
643+
<option value = "json">JSON</option>
643644
<option value = "mysql">MySQL数据库</option>
644645
</select>
645646
<label>导出文件名/数据库表格名称(可使用../表示相对路径以改变文件保存位置,名称中的“current_time”会被替换为执行任务时的时间戳):</label>

ElectronJS/tasks/112.json

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

ElectronJS/tasks/200.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"id":200,"name":"图片下载","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"7/24/2023, 1:33:01 PM","update_time":"7/24/2023, 2:08:02 PM","version":"0.5.0","saveThreshold":10,"cloudflare":0,"environment":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"current_time","inputExcel":"","startFromExit":0,"containJudge":false,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.jd.com"}],"outputParameters":[{"id":0,"name":"参数1_图片地址","desc":"","type":"text","recordASField":1,"exampleValue":"//m.360buyimg.com/babel/jfs/t1/160456/7/37206/196421/649c09faFeab01f59/cc5ea5b81653b3a5.png"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[2]/div[1]/div/div[1]/div[1]/a[1]/img[1]","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElem
entIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":["/html/body/div[5]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[1]/a[1]/img[1]","//img[contains(., '')]","/html/body/div[last()-5]/div/div[last()-4]/div/div[last()-1]/div/div[last()-1]/div/div[last()-1]/div/div[last()-8]/div/div/a/img"]}},{"id":3,"index":3,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"paras":[{"nodeType":4,"contentType":0,"relative":true,"name":"参数1_图片地址","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"//m.360buyimg.com/babel/jfs/t1/160456/7/37206/196421/649c09faFeab01f59/cc5ea5b81653b3a5.png"}],"unique_index":"wujk9g2wu7lkgfly5m","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"arguments[0].src = arguments[0].src.replace(\"a\",\"b\")","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":1}],"loopType":1}}]}

ElectronJS/tasks/201.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"id":201,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"7/24/2023, 2:03:04 PM","update_time":"7/24/2023, 2:06:26 PM","version":"0.5.0","saveThreshold":10,"cloudflare":0,"environment":0,"maxViewLength":15,"recordLog":1,"outputFormat":"json","saveName":"TEST","inputExcel":"","startFromExit":0,"containJudge":false,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.jd.com"}],"outputParameters":[{"id":0,"name":"参数1_链接文本","desc":"","type":"text","recordASField":1,"exampleValue":"手机"},{"id":1,"name":"参数2_链接地址","desc":"","type":"text","recordASField":1,"exampleValue":"https://shouji.jd.com/"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div/a","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIfr
ameIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":["/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/a[1]","//a[contains(., '手机')]","/html/body/div[last()-6]/div/div[last()-4]/div/div[last()-2]/div/div/div/div[last()-1]/div[last()-12]/a[last()-1]"]}},{"id":3,"index":3,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"paras":[{"nodeType":1,"contentType":0,"relative":true,"name":"参数1_链接文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"手机"}],"unique_index":"g8qhgp3k42llkggokun","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0},{"nodeType":2,"contentType":0,"relative":true,"name":"参数2_链接地址","desc":"","relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"https://shouji.jd.com/"}],"unique_index":"g8qhgp3k42llkggokun","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}],"loopType":1}}]}

ExecuteStage/.vscode/launch.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
"justMyCode": false,
1313
// "args": ["--id", "[7]", "--read_type", "remote", "--headless", "0"]
1414
// "args": ["--id", "[9]", "--read_type", "remote", "--headless", "0", "--saved_file_name", "YOUTUBE"]
15-
"args": ["--id", "[8]", "--headless", "0", "--user_data", "1", "--keyboard", "0"]
15+
"args": ["--id", "[16]", "--headless", "0", "--user_data", "1", "--keyboard", "0"]
1616
}
1717
]
1818
}

ExecuteStage/easyspider_executestage.py

Lines changed: 17 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# -*- coding: utf-8 -*-
22
# import atexit
3-
from utils import download_image, get_output_code, isnotnull, lowercase_tags_in_xpath, myMySQL, new_line, on_press_creator, on_release_creator, replace_field_values, write_to_csv, write_to_excel
3+
from utils import download_image, get_output_code, isnotnull, lowercase_tags_in_xpath, myMySQL, new_line, on_press_creator, on_release_creator, replace_field_values, write_to_csv, write_to_excel, write_to_json
44
from myChrome import MyChrome
55
from threading import Thread, Event
66
from PIL import Image
@@ -152,27 +152,24 @@ def __init__(self, browser_t, id, service, version, event, saveName, config):
152152
filter(isnotnull, service["links"].split("\n"))) # 要执行的link的列表
153153
self.OUTPUT = [] # 采集的数据
154154
self.writeMode = 1 # 写入模式,0为新建,1为追加
155-
if self.outputFormat == "csv" or self.outputFormat == "txt":
155+
if self.outputFormat == "csv" or self.outputFormat == "txt" or self.outputFormat == "xlsx":
156156
if not os.path.exists("Data/Task_" + str(self.id) + "/" + self.saveName + '.' + self.outputFormat):
157157
self.OUTPUT.append([]) # 添加表头
158158
self.writeMode = 0
159-
elif self.outputFormat == "xlsx":
160-
if not os.path.exists("Data/Task_" + str(self.id) + "/" + self.saveName + '.xlsx'):
161-
self.OUTPUT.append([]) # 添加表头
162-
self.writeMode = 0
159+
elif self.outputFormat == "json":
160+
self.writeMode = 3 # JSON模式无需判断是否存在文件
163161
elif self.outputFormat == "mysql":
164162
self.mysql = myMySQL(config["mysql_config_path"])
165163
self.mysql.create_table(self.saveName, service["outputParameters"])
166164
self.writeMode = 2
167-
if self.writeMode == 1:
168-
self.print_and_log("追加模式")
169-
self.print_and_log("Append Mode")
170-
elif self.writeMode == 0:
171-
self.print_and_log("新建模式")
172-
self.print_and_log("New Mode")
165+
if self.writeMode == 0:
166+
self.print_and_log("新建模式|Create Mode")
167+
elif self.writeMode == 1:
168+
self.print_and_log("追加模式|Append Mode")
173169
elif self.writeMode == 2:
174-
self.print_and_log("MySQL模式")
175-
self.print_and_log("MySQL Mode")
170+
self.print_and_log("MySQL模式|MySQL Mode")
171+
elif self.writeMode == 3:
172+
self.print_and_log("JSON模式|JSON Mode")
176173
self.containJudge = service["containJudge"] # 是否含有判断语句
177174
self.outputParameters = {}
178175
self.service = service
@@ -401,6 +398,10 @@ def saveData(self, exit=False):
401398
str(self.id) + "/" + self.saveName + '.xlsx'
402399
write_to_excel(
403400
file_name, self.OUTPUT, self.outputParametersTypes, self.outputParametersRecord)
401+
elif self.outputFormat == "json":
402+
file_name = "Data/Task_" + \
403+
str(self.id) + "/" + self.saveName + '.json'
404+
write_to_json(file_name, self.OUTPUT, self.outputParametersTypes, self.outputParametersRecord, self.outputParameters.keys())
404405
elif self.outputFormat == "mysql":
405406
self.mysql.write_to_mysql(
406407
self.OUTPUT, self.outputParametersRecord, self.outputParametersTypes)
@@ -1395,7 +1396,7 @@ def get_content(self, p, element):
13951396
except:
13961397
downloadPic = 0
13971398
if downloadPic == 1:
1398-
download_image(content, "Data/Task_" +
1399+
download_image(self, content, "Data/Task_" +
13991400
str(self.id) + "/" + self.saveName + "/")
14001401
else: # 普通节点
14011402
content = element.text
@@ -1420,7 +1421,7 @@ def get_content(self, p, element):
14201421
except:
14211422
downloadPic = 0
14221423
if downloadPic == 1:
1423-
download_image(content, "Data/Task_" +
1424+
download_image(self, content, "Data/Task_" +
14241425
str(self.id) + "/" + self.saveName + "/")
14251426
else:
14261427
command = 'var arr = [];\

ExecuteStage/utils.py

Lines changed: 42 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ def on_release(key):
9595
# time.sleep(1) # 每秒检查一次
9696

9797

98-
def download_image(url, save_directory):
98+
def download_image(browser, url, save_directory):
9999
# 定义浏览器头信息
100100
headers = {
101101
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
@@ -120,15 +120,15 @@ def download_image(url, save_directory):
120120
with open(save_path, 'wb') as file:
121121
file.write(response.content)
122122

123-
print("图片已成功下载到:", save_path)
124-
print("The image has been successfully downloaded to:", save_path)
123+
browser.print_and_log("图片已成功下载到:", save_path)
124+
browser.print_and_log("The image has been successfully downloaded to:", save_path)
125125
else:
126-
print("下载图片失败,请检查此图片链接是否有效:", url)
127-
print(
126+
browser.print_and_log("下载图片失败,请检查此图片链接是否有效:", url)
127+
browser.print_and_log(
128128
"Failed to download image, please check if this image link is valid:", url)
129129
else:
130-
print("下载图片失败,请检查此图片链接是否有效:", url)
131-
print("Failed to download image, please check if this image link is valid:", url)
130+
browser.print_and_log("下载图片失败,请检查此图片链接是否有效:", url)
131+
browser.print_and_log("Failed to download image, please check if this image link is valid:", url)
132132

133133

134134
def get_output_code(output):
@@ -182,6 +182,41 @@ def replace_field_values(orginal_text, outputParameters):
182182
return replaced_text
183183

184184

185+
def write_to_json(file_name, data, types, record, keys):
    """Append rows of collected data to a JSON file holding a list of objects.

    Each row of ``data`` is turned into one dict whose keys come from ``keys``
    and whose values come from the row, position by position. The resulting
    dicts are appended to whatever list is already stored in ``file_name``
    and the whole list is written back.

    Args:
        file_name: Path of the JSON file to create or extend.
        data: Iterable of rows; each row is a sequence of field values.
        types: Per-field type names. ``"int"``/``"bigInt"`` values are
            converted with ``int`` and ``"double"`` values with ``float``;
            any other type name leaves the value untouched.
        record: Per-field flags; a field is written only when its flag is
            truthy.
        keys: Iterable of field names, in the same order as the row values.

    Notes:
        * Values that cannot be converted fall back to ``0`` / ``0.0``.
        * The rows in ``data`` are not modified.
        * Fields are paired with ``zip``, so values beyond the shortest of
          ``types``/``record``/``keys`` are ignored instead of raising
          ``IndexError``.
        * A missing, unreadable or unparsable file is treated as empty
          (best-effort append). If the file holds a JSON value that is not
          a list, that value is kept as the first element of the new list.
    """
    keys = list(keys)

    # Transform every row into a {field name: converted value} dict.
    data_to_write = []
    for line in data:
        to_write = {}
        for value, type_name, keep, key in zip(line, types, record, keys):
            if not keep:
                continue
            if type_name in ("int", "bigInt"):
                try:
                    value = int(value)
                except (TypeError, ValueError, OverflowError):
                    value = 0
            elif type_name == "double":
                try:
                    value = float(value)
                except (TypeError, ValueError):
                    value = 0.0
            to_write[key] = value
        data_to_write.append(to_write)

    # Load what is already stored so the new rows are appended to it.
    try:
        with open(file_name, 'r', encoding='utf-8') as f:
            json_data = json.load(f)
    except (OSError, ValueError):
        # OSError: file missing/unreadable. ValueError covers
        # json.JSONDecodeError and UnicodeDecodeError (corrupt content).
        json_data = []
    if not isinstance(json_data, list):
        # Keep a pre-existing non-list document instead of failing on extend.
        json_data = [json_data]

    json_data.extend(data_to_write)

    # Write the merged list back; keep non-ASCII text human readable.
    with open(file_name, 'w', encoding='utf-8') as f:
        json.dump(json_data, f, ensure_ascii=False)
219+
185220
def write_to_excel(file_name, data, types, record):
186221
first = False
187222
if os.path.exists(file_name):

0 commit comments

Comments
 (0)