Skip to content

Commit 8e24341

Browse files
authored
Fix dataset with empty char. (#8469)
* fix dataset with empty char.
* revert file links.
1 parent 4bca376 commit 8e24341

File tree

7 files changed

+24
-24
lines changed

7 files changed

+24
-24
lines changed

docs/llm/pretraining/data/OpenWebText2.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
```shell
1515
# wget https://mystic.the-eye.eu/public/AI/pile_preliminary_components/openwebtext2.jsonl.zst.tar
1616
wget https://paddlenlp.bj.bcebos.com/models/transformers/gpt/openwebtext2.jsonl.zst.tar
17-
tar -xvf openwebtext2.json.zst.tar -C /path/to/openwebtext
17+
tar -xvf openwebtext2.jsonl.zst.tar -C /path/to/openwebtext
1818
```
1919

2020
## Llama训练数据制作

llm/README.md

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -61,20 +61,20 @@ PaddleNLP将飞桨4D并行策略加入到Trainer API中, 用户只需修改Tra
6161
为了方便用户运行测试本模型,本项目提供了处理好的100k条doc的训练样本:
6262
```shell
6363
# llama 模型数据下载
64-
wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy
65-
wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_idx.npz
64+
wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k.bin
65+
wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k.idx
6666

6767
# gpt 模型数据下载
68-
# wget https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_ids.npy
69-
# wget https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_idx.npz
68+
# wget https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt2_openwebtext_100k.bin
69+
# wget https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt2_openwebtext_100k.idx
7070
```
7171

7272
将所有预处理得到的文件统一放入一个文件夹中,以备训练使用:
7373

7474
```
7575
mkdir data
76-
mv llama_openwebtext_100k_ids.npy ./data
77-
mv llama_openwebtext_100k_idx.npz ./data
76+
mv llama_openwebtext_100k.bin ./data
77+
mv llama_openwebtext_100k.idx ./data
7878
```
7979

8080
```shell

llm/docs/pretrain.rst

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -44,8 +44,8 @@ git clone 代码到本地,即可开始。
4444
.. code-block:: bash
4545
4646
# llama 模型数据下载
47-
wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy
48-
wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_idx.npz
47+
wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k.bin
48+
wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k.idx
4949
5050
# gpt 模型数据下载
5151
# wget https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_ids.npy
@@ -57,8 +57,8 @@ git clone 代码到本地,即可开始。
5757
.. code-block:: bash
5858
5959
mkdir data
60-
mv llama_openwebtext_100k_ids.npy ./data
61-
mv llama_openwebtext_100k_idx.npz ./data
60+
mv llama_openwebtext_100k.bin ./data
61+
mv llama_openwebtext_100k.idx ./data
6262
6363
6464

model_zoo/ernie-1.0/preprocess/create_pretraining_data.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ def get_args():
103103
group.add_argument("--append_eos", action="store_true", help="Append an <eos> token to the end of a document.")
104104
group.add_argument("--log_interval", type=int, default=100, help="Interval between progress updates")
105105
group.add_argument("--workers", type=int, default=1, help="Number of worker processes to launch")
106-
group.add_argument("--max_doc_num", type=int, default=sys.maxsize, help="Number of worker processes to launch")
106+
group.add_argument("--max_doc_num", type=int, default=sys.maxsize, help="Stop when reach max_doc_num.")
107107
group.add_argument(
108108
"--max_repeated_len", type=int, default=100, help="The maximum length of the repeated characters to keep"
109109
)

tests/llm/test_pretrain.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,8 +59,8 @@ def test_pretrain(self):
5959
del sys.modules["run_pretrain"]
6060

6161
# Run pretrain
62-
URL = "https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy"
63-
URL2 = "https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_idx.npz"
62+
URL = "https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k.bin"
63+
URL2 = "https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k.idx"
6464
get_path_from_url(URL, root_dir=self.dataset_dir)
6565
get_path_from_url(URL2, root_dir=self.dataset_dir)
6666

tests/test_tipc/auto_tuner/llama_pretrain/benchmark_common/prepare.sh

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,11 @@ cd ../../../llm/llama
2424
python -m pip install tool_helpers
2525

2626
rm -rf data && mkdir data
27-
wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy
28-
wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_idx.npz
27+
wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k.bin
28+
wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k.idx
2929

30-
mv llama_openwebtext_100k_ids.npy ./data
31-
mv llama_openwebtext_100k_idx.npz ./data
30+
mv llama_openwebtext_100k.bin ./data
31+
mv llama_openwebtext_100k.idx ./data
3232

3333
# mv autoconfig
3434
rm -rf autoconfig

tests/trainer/test_unified_checkpoint.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -186,8 +186,8 @@ def setUp(self):
186186
os.environ.update(environment_variables)
187187

188188
files = [
189-
"https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy",
190-
"https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_idx.npz",
189+
"https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k.bin",
190+
"https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k.idx",
191191
]
192192
self.prepare_inputs_data(pretrain_arguments["input_dir"], files)
193193

@@ -651,8 +651,8 @@ def setUp(self):
651651
os.environ.update(environment_variables)
652652

653653
files = [
654-
"https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy",
655-
"https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_idx.npz",
654+
"https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k.bin",
655+
"https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k.idx",
656656
]
657657
self.prepare_inputs_data(pretrain_arguments["input_dir"], files)
658658

@@ -693,8 +693,8 @@ def setUp(self):
693693
os.environ.update(environment_variables)
694694

695695
files = [
696-
"https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy",
697-
"https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_idx.npz",
696+
"https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k.bin",
697+
"https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k.idx",
698698
]
699699
self.prepare_inputs_data(pretrain_arguments["input_dir"], files)
700700

0 commit comments

Comments
 (0)