Skip to content

Commit 2803202

Browse files
yy462yuyang2S2023
and authored
Update downloading cora dataset path (#36)
* Update the downloading path of the cora dataset
* Add ogbn dataset
* Update data_process.py
* Fix issues detected by pre-commit

Co-authored-by: Yu Yang <[email protected]>
1 parent c24a809 commit 2803202

File tree

2 files changed

+53
-9
lines changed

2 files changed

+53
-9
lines changed

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
# Federated Graph Learning [![PyPI Downloads](https://static.pepy.tech/badge/fedgraph)](https://pepy.tech/projects/fedgraph)
22

3-
[pypi-url]: https://pypi.python.org/pypi/fedgraph
3+
[pypi-url]: https://pypi.python.org/pypi/fedgraph
44

5-
**[Documentation](https://docs.fedgraph.org)** | **[Paper](https://arxiv.org/abs/2410.06340)** | **[Slack](https://join.slack.com/t/fedgraphlibrary/shared_invite/zt-2wztvbo1v-DO81DnUD86q066mxnQuWWw)**
5+
**[Documentation](https://docs.fedgraph.org)** | **[Paper](https://arxiv.org/abs/2410.06340)** | **[Slack](https://join.slack.com/t/fedgraphlibrary/shared_invite/zt-2wztvbo1v-DO81DnUD86q066mxnQuWWw)**
66

77
**FedGraph** *(Federated Graph)* is a library built on top of [PyTorch Geometric (PyG)](https://www.pyg.org/),
88
[Ray](https://docs.ray.io/), and [PyTorch](https://pytorch.org/) to easily train Graph Neural Networks

fedgraph/data_process.py

Lines changed: 51 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# setting of data generation
22

3+
import os
34
import pickle as pkl
45
import random
56
import sys
@@ -10,6 +11,7 @@
1011
import networkx as nx
1112
import numpy as np
1213
import pandas as pd
14+
import requests
1315
import scipy.sparse as sp
1416
import torch
1517
import torch_geometric
@@ -178,6 +180,31 @@ def NC_parse_index_file(filename: str) -> list:
178180
return index
179181

180182

183+
def download_file_from_github(url: str, save_path: str) -> None:
    """
    Downloads a file from a GitHub URL and saves it to a specified local path.

    Parameters
    ----------
    url : str
        Raw file URL to download.
    save_path : str
        Local filesystem path where the downloaded file is written.

    Raises
    ------
    Exception
        If the server responds with a non-200 HTTP status code.

    Note
    ----
    - The function downloads files in chunks to handle large files efficiently.
    - If the file already exists at `save_path`, it will not be downloaded again.
    """
    # Guard clause: skip the network round-trip entirely for cached files.
    if os.path.exists(save_path):
        print(f"File already exists: {save_path}")
        return

    print(f"Downloading {url} to {save_path}...")
    # `timeout` prevents a stalled connection from hanging the caller forever
    # (the original call had none); the `with` block guarantees the HTTP
    # response/connection is released even when an exception is raised.
    with requests.get(url, stream=True, timeout=60) as response:
        if response.status_code != 200:
            raise Exception(
                f"Failed to download {url}. HTTP Status Code: {response.status_code}"
            )
        with open(save_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive chunks
                    f.write(chunk)
    print(f"Downloaded {save_path}")
206+
207+
181208
def NC_load_data(dataset_str: str) -> tuple:
182209
"""
183210
Loads input data from 'gcn/data' directory and processes these datasets into a format
@@ -217,21 +244,38 @@ def NC_load_data(dataset_str: str) -> tuple:
217244
"""
218245
if dataset_str in ["cora", "citeseer", "pubmed"]:
219246
# download dataset from torch_geometric
220-
dataset = torch_geometric.datasets.Planetoid("./data", dataset_str)
221-
names = ["x", "y", "tx", "ty", "allx", "ally", "graph"]
247+
BASE_URL = "https://github.com/kimiyoung/planetoid/raw/master/data"
248+
DATA_DIR = f"./data/{dataset_str}/raw/"
249+
os.makedirs(DATA_DIR, exist_ok=True)
250+
251+
filenames = [
252+
f"ind.{dataset_str}.x",
253+
f"ind.{dataset_str}.tx",
254+
f"ind.{dataset_str}.allx",
255+
f"ind.{dataset_str}.y",
256+
f"ind.{dataset_str}.ty",
257+
f"ind.{dataset_str}.ally",
258+
f"ind.{dataset_str}.graph",
259+
f"ind.{dataset_str}.test.index",
260+
]
261+
262+
for filename in filenames:
263+
file_url = f"{BASE_URL}/{filename}"
264+
save_path = os.path.join(DATA_DIR, filename)
265+
download_file_from_github(file_url, save_path)
266+
222267
objects = []
223-
for i in range(len(names)):
224-
with open(
225-
"data/{}/raw/ind.{}.{}".format(dataset_str, dataset_str, names[i]), "rb"
226-
) as f:
268+
for name in ["x", "y", "tx", "ty", "allx", "ally", "graph"]:
269+
file_path = os.path.join(DATA_DIR, f"ind.{dataset_str}.{name}")
270+
with open(file_path, "rb") as f:
227271
if sys.version_info > (3, 0):
228272
objects.append(pkl.load(f, encoding="latin1"))
229273
else:
230274
objects.append(pkl.load(f))
231275

232276
x, y, tx, ty, allx, ally, graph = tuple(objects)
233277
test_idx_reorder = NC_parse_index_file(
234-
"data/{}/raw/ind.{}.test.index".format(dataset_str, dataset_str)
278+
os.path.join(DATA_DIR, f"ind.{dataset_str}.test.index")
235279
)
236280
test_idx_range = np.sort(test_idx_reorder)
237281

0 commit comments

Comments
 (0)