|
4 | 4 | "cell_type": "markdown",
|
5 | 5 | "metadata": {
|
6 | 6 | "application/vnd.databricks.v1+cell": {
|
7 |
| - "cellMetadata": {}, |
| 7 | + "cellMetadata": { |
| 8 | + "byteLimit": 2048000, |
| 9 | + "rowLimit": 10000 |
| 10 | + }, |
8 | 11 | "inputWidgets": {},
|
9 | 12 | "nuid": "f275a21b-47d4-472c-972b-e2a84a597db2",
|
10 | 13 | "showTitle": false,
|
|
54 | 57 | "cell_type": "markdown",
|
55 | 58 | "metadata": {
|
56 | 59 | "application/vnd.databricks.v1+cell": {
|
57 |
| - "cellMetadata": {}, |
| 60 | + "cellMetadata": { |
| 61 | + "byteLimit": 2048000, |
| 62 | + "rowLimit": 10000 |
| 63 | + }, |
58 | 64 | "inputWidgets": {},
|
59 | 65 | "nuid": "3d08a21c-9f5a-4ad2-af85-e016335cc53d",
|
60 | 66 | "showTitle": false,
|
|
173 | 179 | "import re\n",
|
174 | 180 | "import json\n",
|
175 | 181 | "import tempfile\n",
|
| 182 | + "import random\n", |
176 | 183 | "import numpy as np\n",
|
177 | 184 | "import pandas as pd \n",
|
178 | 185 | "from collections import defaultdict\n",
|
|
193 | 200 | "cell_type": "markdown",
|
194 | 201 | "metadata": {
|
195 | 202 | "application/vnd.databricks.v1+cell": {
|
196 |
| - "cellMetadata": {}, |
| 203 | + "cellMetadata": { |
| 204 | + "byteLimit": 2048000, |
| 205 | + "rowLimit": 10000 |
| 206 | + }, |
197 | 207 | "inputWidgets": {},
|
198 | 208 | "nuid": "3a513cdd-967d-4a87-b56f-340053fa79cd",
|
199 | 209 | "showTitle": false,
|
|
208 | 218 | "cell_type": "markdown",
|
209 | 219 | "metadata": {
|
210 | 220 | "application/vnd.databricks.v1+cell": {
|
211 |
| - "cellMetadata": {}, |
| 221 | + "cellMetadata": { |
| 222 | + "byteLimit": 2048000, |
| 223 | + "rowLimit": 10000 |
| 224 | + }, |
212 | 225 | "inputWidgets": {},
|
213 | 226 | "nuid": "cfebdfdf-b87c-4a77-b97c-4697566a55fa",
|
214 | 227 | "showTitle": false,
|
|
255 | 268 | {
|
256 | 269 | "cell_type": "code",
|
257 | 270 | "execution_count": null,
|
| 271 | + "metadata": { |
| 272 | + "application/vnd.databricks.v1+cell": { |
| 273 | + "cellMetadata": { |
| 274 | + "byteLimit": 2048000, |
| 275 | + "rowLimit": 10000 |
| 276 | + }, |
| 277 | + "inputWidgets": {}, |
| 278 | + "nuid": "0d1f2e9e-db40-41fd-a6b9-bb4757db08b0", |
| 279 | + "showTitle": false, |
| 280 | + "title": "" |
| 281 | + } |
| 282 | + }, |
| 283 | + "outputs": [], |
| 284 | + "source": [ |
| 285 | + "# Working directory for this notebook: later cells build their data, cache and output paths under ``home``, so make sure you have write access to it\n", |
| 286 | + "home = os.path.join('/local_disk0', 'ift')\n", |
| 287 | + "os.makedirs(home, exist_ok=True)\n", |
| 288 | + "os.chdir(home)" |
| 289 | + ] |
| 290 | + }, |
| 291 | + { |
| 292 | + "cell_type": "code", |
| 293 | + "execution_count": 0, |
258 | 294 | "metadata": {
|
259 | 295 | "application/vnd.databricks.v1+cell": {
|
260 | 296 | "cellMetadata": {
|
|
271 | 307 | "source": [
|
272 | 308 | "FT_API_args = Namespace(\n",
|
273 | 309 | " model= 'mosaicml/mpt-7b', # Other examples: 'EleutherAI/gpt-neox-20b'\n",
|
274 |
| - " train_data_path= 'main.streaming.random_large_table', # Other examples: 'tatsu-lab/alpaca/train', # '/Volumes/main/mosaic_hackathon/managed-volume/IFT/train.jsonl' # 'mosaicml/dolly_hhrlhf/train'\n", |
| 310 | + " train_data_path= 'mosaicml/dolly_hhrlhf/train', # Other examples: a local file such as '/path/to/train.jsonl', or a table name such as 'catalog.schema.table'\n", |
275 | 311 | " task_type='INSTRUCTION_FINETUNE',\n",
|
276 | 312 | " training_duration=3,\n",
|
277 | 313 | " context_length=2048,\n",
|
278 | 314 | ")\n",
|
279 | 315 | "\n",
|
280 |
| - "temporary_jsonl_data_path = '/Volumes/main/mosaic_hackathon/managed-volume/IFT/ft_data_11Jan24_3/train'\n", |
281 |
| - "os.environ['HF_DATASETS_CACHE'] = '/tmp/'\n", |
282 |
| - "os.makedirs(temporary_jsonl_data_path, exist_ok=True)" |
| 316 | + "temporary_jsonl_data_path = os.path.join(home, 'ft_data_11Jan24_3/train') # scratch directory under home; presumably where converted JSONL data is written - TODO confirm\n", |
| 317 | + "os.environ['HF_DATASETS_CACHE'] = os.path.join(home, 'hf_cache') # keep the Hugging Face datasets cache in a directory under home\n", |
| 318 | + "os.makedirs(temporary_jsonl_data_path, exist_ok=True)\n", |
| 319 | + "os.makedirs(os.environ['HF_DATASETS_CACHE'], exist_ok=True)" |
283 | 320 | ]
|
284 | 321 | },
|
285 | 322 | {
|
286 | 323 | "cell_type": "markdown",
|
287 | 324 | "metadata": {
|
288 | 325 | "application/vnd.databricks.v1+cell": {
|
289 |
| - "cellMetadata": {}, |
| 326 | + "cellMetadata": { |
| 327 | + "byteLimit": 2048000, |
| 328 | + "rowLimit": 10000 |
| 329 | + }, |
290 | 330 | "inputWidgets": {},
|
291 | 331 | "nuid": "39c45005-1a77-4162-b9e4-bd8df6f5ec69",
|
292 | 332 | "showTitle": false,
|
|
362 | 402 | "cell_type": "markdown",
|
363 | 403 | "metadata": {
|
364 | 404 | "application/vnd.databricks.v1+cell": {
|
365 |
| - "cellMetadata": {}, |
| 405 | + "cellMetadata": { |
| 406 | + "byteLimit": 2048000, |
| 407 | + "rowLimit": 10000 |
| 408 | + }, |
366 | 409 | "inputWidgets": {},
|
367 | 410 | "nuid": "06d46367-bd32-473a-9f16-1b34a8dd9356",
|
368 | 411 | "showTitle": false,
|
|
377 | 420 | "cell_type": "markdown",
|
378 | 421 | "metadata": {
|
379 | 422 | "application/vnd.databricks.v1+cell": {
|
380 |
| - "cellMetadata": {}, |
| 423 | + "cellMetadata": { |
| 424 | + "byteLimit": 2048000, |
| 425 | + "rowLimit": 10000 |
| 426 | + }, |
381 | 427 | "inputWidgets": {},
|
382 | 428 | "nuid": "1a28320a-a2a1-4f3c-a0cd-ad6045a24f64",
|
383 | 429 | "showTitle": false,
|
|
467 | 513 | "cell_type": "markdown",
|
468 | 514 | "metadata": {
|
469 | 515 | "application/vnd.databricks.v1+cell": {
|
470 |
| - "cellMetadata": {}, |
| 516 | + "cellMetadata": { |
| 517 | + "byteLimit": 2048000, |
| 518 | + "rowLimit": 10000 |
| 519 | + }, |
471 | 520 | "inputWidgets": {},
|
472 | 521 | "nuid": "9713a0ce-80f4-4187-b10b-4223b17fe4c1",
|
473 | 522 | "showTitle": false,
|
|
506 | 555 | "cell_type": "markdown",
|
507 | 556 | "metadata": {
|
508 | 557 | "application/vnd.databricks.v1+cell": {
|
509 |
| - "cellMetadata": {}, |
| 558 | + "cellMetadata": { |
| 559 | + "byteLimit": 2048000, |
| 560 | + "rowLimit": 10000 |
| 561 | + }, |
510 | 562 | "inputWidgets": {},
|
511 | 563 | "nuid": "7249e9e6-1ea7-4fc9-8959-8a17d62a9fb4",
|
512 | 564 | "showTitle": false,
|
|
547 | 599 | "cell_type": "markdown",
|
548 | 600 | "metadata": {
|
549 | 601 | "application/vnd.databricks.v1+cell": {
|
550 |
| - "cellMetadata": {}, |
| 602 | + "cellMetadata": { |
| 603 | + "byteLimit": 2048000, |
| 604 | + "rowLimit": 10000 |
| 605 | + }, |
551 | 606 | "inputWidgets": {},
|
552 | 607 | "nuid": "6699f47f-9b53-47da-95c0-b862c5826d0a",
|
553 | 608 | "showTitle": false,
|
|
562 | 617 | "cell_type": "markdown",
|
563 | 618 | "metadata": {
|
564 | 619 | "application/vnd.databricks.v1+cell": {
|
565 |
| - "cellMetadata": {}, |
| 620 | + "cellMetadata": { |
| 621 | + "byteLimit": 2048000, |
| 622 | + "rowLimit": 10000 |
| 623 | + }, |
566 | 624 | "inputWidgets": {},
|
567 | 625 | "nuid": "dd37fdce-62d0-493e-bfa9-d823634b2a0d",
|
568 | 626 | "showTitle": false,
|
|
624 | 682 | "source": [
|
625 | 683 | "FT_API_args = Namespace(\n",
|
626 | 684 | " model= 'mosaicml/mpt-7b',\n",
|
627 |
| - " train_data_path= '/Volumes/main/mosaic_hackathon/managed-volume/ABT',\n", |
| 685 | + " train_data_path= os.path.join(home, 'ABT'), # folder containing your collection of .txt files\n", |
628 | 686 | " task_type='CONTINUED_PRETRAIN',\n",
|
629 | 687 | " training_duration=3,\n",
|
630 |
| - " context_length=2048,\n", |
| 688 | + " context_length=8, # NOTE(review): very small; presumably sized for the tiny synthetic dataset generated below (the earlier cell uses 2048) - confirm before using real data\n", |
631 | 689 | ")\n",
|
632 |
| - "temporary_mds_output_path = '/Volumes/main/mosaic_hackathon/managed-volume/{your_username}/mds_data_11Jan24_5'" |
| 690 | + "temporary_mds_output_path = os.path.join(home, 'mds_data_11Jan24_5') # output directory under home; presumably receives the MDS-converted data - TODO confirm" |
| 691 | + ] |
| 692 | + }, |
| 693 | + { |
| 694 | + "cell_type": "markdown", |
| 695 | + "metadata": { |
| 696 | + "application/vnd.databricks.v1+cell": { |
| 697 | + "cellMetadata": { |
| 698 | + "byteLimit": 2048000, |
| 699 | + "rowLimit": 10000 |
| 700 | + }, |
| 701 | + "inputWidgets": {}, |
| 702 | + "nuid": "fc2e4e8b-7700-47c4-bb21-ae4c389f39a2", |
| 703 | + "showTitle": false, |
| 704 | + "title": "" |
| 705 | + } |
| 706 | + }, |
| 707 | + "source": [ |
| 708 | + "Generate a small synthetic dataset of text files for demonstration. In practice, skip this step and set `train_data_path` to the folder that contains your own raw text files." |
| 709 | + ] |
| 710 | + }, |
| 711 | + { |
| 712 | + "cell_type": "code", |
| 713 | + "execution_count": 0, |
| 714 | + "metadata": { |
| 715 | + "application/vnd.databricks.v1+cell": { |
| 716 | + "cellMetadata": { |
| 717 | + "byteLimit": 2048000, |
| 718 | + "rowLimit": 10000 |
| 719 | + }, |
| 720 | + "inputWidgets": {}, |
| 721 | + "nuid": "10f08422-5091-4e64-b3f7-54928584cd60", |
| 722 | + "showTitle": false, |
| 723 | + "title": "" |
| 724 | + } |
| 725 | + }, |
| 726 | + "outputs": [], |
| 727 | + "source": [ |
| 728 | + "def generate_synthetic_dataset(folder_path, num_files=128):\n", |
| 729 | + " \"\"\"Generate a synthetic dataset of text files with random words.\"\"\"\n", |
| 730 | + " def generate_random_words(num_words=50):\n", |
| 731 | + " words = [\"apple\", \"banana\", \"cherry\", \"date\", \"elderberry\", \"fig\", \"grape\", \"honeydew\", \"kiwi\", \"lemon\", \"mango\", \"nectarine\", \"orange\", \"papaya\", \"quince\", \"raspberry\", \"strawberry\", \"tangerine\", \"ugli\", \"vanilla\", \"watermelon\", \"xigua\", \"yam\", \"zucchini\"]\n", |
| 732 | + " return ' '.join(random.choice(words) for _ in range(num_words))\n", |
| 733 | + "\n", |
| 734 | + " if not os.path.exists(folder_path):\n", |
| 735 | + " os.makedirs(folder_path)\n", |
| 736 | + " \n", |
| 737 | + " for i in range(num_files):\n", |
| 738 | + " file_path = os.path.join(folder_path, f\"file_{i}.txt\")\n", |
| 739 | + " with open(file_path, 'w') as file:\n", |
| 740 | + " file.write(generate_random_words())\n", |
| 741 | + "\n", |
| 742 | + " print(f\"Generated {num_files} files in '{folder_path}'.\")\n", |
| 743 | + "\n", |
| 744 | + "generate_synthetic_dataset(FT_API_args.train_data_path)" |
633 | 745 | ]
|
634 | 746 | },
|
635 | 747 | {
|
|
656 | 768 | "cell_type": "markdown",
|
657 | 769 | "metadata": {
|
658 | 770 | "application/vnd.databricks.v1+cell": {
|
659 |
| - "cellMetadata": {}, |
| 771 | + "cellMetadata": { |
| 772 | + "byteLimit": 2048000, |
| 773 | + "rowLimit": 10000 |
| 774 | + }, |
660 | 775 | "inputWidgets": {},
|
661 | 776 | "nuid": "c21e7d1b-db34-4e5d-b6d9-190dc75170d3",
|
662 | 777 | "showTitle": false,
|
|
688 | 803 | {
|
689 | 804 | "cell_type": "code",
|
690 | 805 | "execution_count": null,
|
| 806 | + "metadata": { |
| 807 | + "application/vnd.databricks.v1+cell": { |
| 808 | + "cellMetadata": { |
| 809 | + "byteLimit": 2048000, |
| 810 | + "rowLimit": 10000 |
| 811 | + }, |
| 812 | + "inputWidgets": {}, |
| 813 | + "nuid": "f5aea2a8-db29-40c9-8ed2-b6a1d032e7ab", |
| 814 | + "showTitle": false, |
| 815 | + "title": "" |
| 816 | + } |
| 817 | + }, |
| 818 | + "outputs": [], |
| 819 | + "source": [ |
| 820 | + "import os  # NOTE(review): os is already used by earlier cells, so this re-import looks redundant - confirm it is imported in the top import cell\n", |
| 821 | + "os.makedirs(temporary_mds_output_path, exist_ok=True)  # create the output directory if it does not already exist" |
| 822 | + ] |
| 823 | + }, |
| 824 | + { |
| 825 | + "cell_type": "code", |
| 826 | + "execution_count": 0, |
691 | 827 | "metadata": {
|
692 | 828 | "application/vnd.databricks.v1+cell": {
|
693 | 829 | "cellMetadata": {
|
|
734 | 870 | "cell_type": "markdown",
|
735 | 871 | "metadata": {
|
736 | 872 | "application/vnd.databricks.v1+cell": {
|
737 |
| - "cellMetadata": {}, |
| 873 | + "cellMetadata": { |
| 874 | + "byteLimit": 2048000, |
| 875 | + "rowLimit": 10000 |
| 876 | + }, |
738 | 877 | "inputWidgets": {},
|
739 | 878 | "nuid": "298eb990-9160-4e1b-958f-33dd2c11b54b",
|
740 | 879 | "showTitle": false,
|
|
776 | 915 | "execution_count": null,
|
777 | 916 | "metadata": {
|
778 | 917 | "application/vnd.databricks.v1+cell": {
|
779 |
| - "cellMetadata": {}, |
| 918 | + "cellMetadata": { |
| 919 | + "byteLimit": 2048000, |
| 920 | + "rowLimit": 10000 |
| 921 | + }, |
780 | 922 | "inputWidgets": {},
|
781 | 923 | "nuid": "e123669c-2f77-4d66-93eb-04efd546f39f",
|
782 | 924 | "showTitle": false,
|
|
0 commit comments