ShishirPatil · ShishirPatil · Aug 27, 2024 · Aug 26, 2024 · Aug 27, 2024
diff --git a/raft/README.md b/raft/README.md
@@ -136,6 +136,13 @@ This details the command and process used to generate the example dataset found
 ```bash 
 python3 raft.py --datapath sample_data/United_States_PDF.pdf --output ./sample_ds4 --distractors 4 --doctype pdf --chunk_size 512 --questions 5 --openai_key OPENAI_KEY
 ```
+### Usage with Completely locally using hugging-face models
+
+This details the command and process used to generate the example dataset found in `./sample_ds4`. The document is a pdf of the Wikipedia page on the United States of America. 
+To run the script completely locally use 
+```
+python3 raft_local.py --datapath sample_data/UC_Berkeley_short.pdf --output ./sample_ds4 --chunk_size 512 --questions 5 --doctype pdf --fast
+```
 
 #### 1. Chunk generation
 RAFT takes pdf and divides text into chunks of size 512 tokens. A sample chunk:  

diff --git a/raft/raft_local.py b/raft/raft_local.py
@@ -0,0 +1,310 @@
+import logging
+from typing import Literal, Any
+import argparse
+import json
+import PyPDF2
+import random
+import os, shutil
+from math import ceil
+from datasets import Dataset, concatenate_datasets
+from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForQuestionAnswering
+import torch
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger("huggingface_script")
+
+# Document type literals
+DocType = Literal["api", "pdf", "json", "txt"]
+
+# Every N chunks, save a checkpoint
+N = 15
+
+def get_args() -> argparse.Namespace:
+    """
+    Parses and returns the command line arguments specified by the user.
+    """
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--datapath", type=str, default="", help="The path at which the document is located")
+    parser.add_argument("--output", type=str, default="./", help="The path at which to save the dataset")
+    parser.add_argument("--output-format", type=str, default="hf", help="Format to convert the dataset to. Defaults to hf.")
+    parser.add_argument("--output-type", type=str, default="jsonl", help="Type to export the dataset to. Defaults to jsonl.")
+    parser.add_argument("--distractors", type=int, default=3, help="The number of distractor documents to include per data point / triplet")
+    parser.add_argument("--p", type=float, default=1.0, help="The percentage that the oracle document is included in the context")
+    parser.add_argument("--questions", type=int, default=5, help="The number of data points / triplets to generate per chunk")
+    parser.add_argument("--chunk_size", type=int, default=512, help="The size of each chunk in number of tokens")
+    parser.add_argument("--doctype", type=str, default="pdf", help="The type of the document", choices=["pdf", "txt", "json", "api"])
+    parser.add_argument("--fast", action="store_true", help="Run the script in fast mode (no recovery implemented)")
+
+    args = parser.parse_args()
+    return args
+
+def get_chunks(file_path: str, doctype: DocType = "pdf", chunk_size: int = 512) -> list[str]:
+    """
+    Takes in a `file_path` and `doctype`, retrieves the document, breaks it down into chunks of size
+    `chunk_size`, and returns the chunks as a list of strings.
+    """
+    chunks = []
+
+    logger.info(f"Retrieving chunks from {file_path} of type {doctype}")
+
+    if doctype == "api":
+        # Load API documentation and process it
+        with open(file_path) as f:
+            api_docs_json = json.load(f)
+        chunks = [str(api_doc_json) for api_doc_json in api_docs_json]
+
+    else:
+        if doctype == "json":
+            # Load JSON document
+            with open(file_path, 'r') as f:
+                data = json.load(f)
+            text = data["text"]
+        elif doctype == "pdf":
+            # Load PDF and extract text
+            text = ""
+            with open(file_path, 'rb') as file:
+                reader = PyPDF2.PdfReader(file)
+                num_pages = len(reader.pages)
+                for page_num in range(num_pages):
+                    page = reader.pages[page_num]
+                    text += page.extract_text()
+        elif doctype == "txt":
+            # Load plain text document
+            with open(file_path, 'r') as file:
+                text = file.read()
+        else:
+            raise TypeError("Document is not one of the accepted types: api, pdf, json, txt")
+
+        # Split the text into chunks
+        num_chunks = ceil(len(text) / chunk_size)
+        logger.info(f"Splitting text into {num_chunks} chunks.")
+        for i in range(0, len(text), chunk_size):
+            chunks.append(text[i:i + chunk_size])
+
+    return chunks
+
+# def generate_instructions_hf(chunk: str, x: int = 5, model_name: str = "t5-small") -> list[str]:
+#     """
+#     Uses a Hugging Face model to generate `x` questions based on the given text chunk.
+#     """
+#     # Load the Hugging Face model and tokenizer for question generation
+#     tokenizer = AutoTokenizer.from_pretrained(model_name)
+#     model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+
+#     input_text = f"Generate a question based on the following text: {chunk}"
+#     inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding="longest")
+
+#     questions = []
+#     for _ in range(x):
+#         output = model.generate(inputs.input_ids, max_length=64)
+#         question = tokenizer.decode(output[0], skip_special_tokens=True)
+#         questions.append(question)
+
+#     return questions
+
+
+# def generate_label_hf(question: str, context: str, model_name: str = "deepset/roberta-base-squad2") -> str:
+#     """
+#     Uses a Hugging Face model to generate an answer to the given question based on the context.
+#     """
+#     # Load the Hugging Face model and tokenizer for question-answering
+#     question_answering_pipeline = pipeline("question-answering", model=model_name)
+
+#     result = question_answering_pipeline(question=question, context=context)
+
+#     return result['answer']
+
+def generate_instructions_hf(chunk: str, x: int = 5, model_name: str = "t5-small") -> list[str]:
+    """
+    Uses a Hugging Face model to generate `x` questions based on the given text chunk, utilizing the GPU if available.
+    """
+    # Load the Hugging Face model and tokenizer for question generation
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+
+    # Move model to GPU if available
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    model.to(device)
+
+    input_text = f"Generate questions based on the following text: {chunk}"
+    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding="longest").to(device)
+
+    outputs = model.generate(
+        inputs.input_ids, 
+        max_length=64, 
+        num_beams=x,  # Using beam search with `x` beams
+        num_return_sequences=x  # Returning `x` sequences
+    )
+
+    questions = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
+
+    return questions
+
+def generate_label_hf(question: str, context: str, model_name: str = "deepset/roberta-base-squad2") -> str:
+    """
+    Uses a Hugging Face model to generate an answer to the given question based on the context, utilizing the GPU if available.
+    """
+    # Load the Hugging Face model and tokenizer for question-answering
+    question_answering_pipeline = pipeline("question-answering", model=model_name, device=0 if torch.cuda.is_available() else -1)
+
+    result = question_answering_pipeline(question=question, context=context)
+
+    return result['answer']
+
+def add_chunk_to_dataset(
+    chunks: list[str], 
+    chunk: str, 
+    doctype: DocType = "api", 
+    x: int = 5, 
+    num_distract: int = 3, 
+    p: float = 0.8,
+    model_name_qg: str = "t5-small",
+    model_name_qa: str = "deepset/roberta-base-squad2"
+) -> None:
+    """
+    Given a chunk, create {Q, A, D} triplets and add them to the dataset using Hugging Face models.
+    """
+    global ds
+    i = chunks.index(chunk)
+
+    # Use the Hugging Face model to generate questions
+    qs = generate_instructions_hf(chunk, x, model_name=model_name_qg)
+    for q in qs:
+        datapt = {
+            "id": None,
+            "type": None,
+            "question": None,
+            "context": None,
+            "oracle_context": None,
+            "cot_answer": None
+        }
+
+        datapt["id"] = f"seed_task_{0 if not ds else ds.num_rows}"
+        datapt["type"] = "api call" if doctype == "api" else "general"
+        datapt["question"] = q
+
+        # Create distractor documents
+        docs = [chunk]
+        indices = list(range(0, len(chunks)))
+        indices.remove(i)
+        for j in random.sample(indices, num_distract):
+            docs.append(chunks[j])
+        # Decide whether to add oracle document
+        oracle = random.uniform(0, 1) < p
+        if not oracle:
+            docs[0] = chunks[random.sample(indices, 1)[0]]
+        random.shuffle(docs)
+
+        d = {
+            "title": ["placeholder_title"] * (num_distract + 1),
+            "sentences": docs
+        }
+        datapt["context"] = d
+        datapt["oracle_context"] = chunk
+
+        # Add the answer generated by the Hugging Face model
+        datapt["cot_answer"] = generate_label_hf(q, chunk, model_name=model_name_qa)
+
+        # Construct model instruction
+        context = ""
+        for doc in docs:
+            context += "<DOCUMENT>" + str(doc) + "</DOCUMENT>\n"
+        context += q
+        datapt["instruction"] = context
+
+        # Add to dataset
+        if not ds:
+            # Initialize dataset
+            datapt["id"] = [datapt["id"]]
+            datapt["type"] = [datapt["type"]]
+            datapt["question"] = [datapt["question"]]
+            datapt["context"] = [datapt["context"]]
+            datapt["oracle_context"] = [datapt["oracle_context"]]
+            datapt["cot_answer"] = [datapt["cot_answer"]]
+            datapt["instruction"] = [datapt["instruction"]]
+            ds = Dataset.from_dict(datapt)
+        else:
+            ds = ds.add_item(datapt)
+
+def save_checkpoint(state, filename):
+    """
+    Saves the current state of processing to a file for recovery.
+    """
+    with open(filename, 'w') as f:
+        f.write(str(state))
+
+def load_checkpoint(filename):
+    """
+    Loads the processing state from a checkpoint file.
+    """
+    with open(filename, 'r') as f:
+        return int(f.read())
+
+def main():
+    global ds
+
+    # Get command line arguments
+    args = get_args()
+
+    CHUNK_SIZE = args.chunk_size
+    NUM_DISTRACT_DOCS = args.distractors
+
+    # Split the document into chunks
+    chunks = get_chunks(args.datapath, args.doctype, CHUNK_SIZE)
+
+    ds = None
+
+    num_chunks = len(chunks)
+
+    if not args.fast:
+        start = 0
+        if os.path.exists("checkpoint.txt"):
+            start = int(load_checkpoint("checkpoint.txt"))
+
+        for i in range((start // N) * N, len(chunks)):
+            chunk = chunks[i]
+            save_checkpoint(i, "checkpoint.txt")
+
+            perc = ceil(i / num_chunks * 100)
+            logger.info(f"Adding chunk {i}/{num_chunks}")
+            add_chunk_to_dataset(chunks, chunk, args.doctype, args.questions, NUM_DISTRACT_DOCS)
+
+            if (i + 1) % N == 0:
+                ds.save_to_disk(args.output + "-checkpoints-" + str(i))
+                ds = None
+
+        if ds:
+            ds.save_to_disk(args.output + "-checkpoints-last")
+
+        ds_list = []
+
+        for filename in os.listdir(os.path.dirname(args.output)):
+            if "-checkpoints-" in filename:
+                for f in os.listdir(os.path.dirname(args.output) + "/" + filename):
+                    if f.endswith(".arrow"):
+                        ds_list.append(Dataset.from_file(os.path.dirname(args.output) + "/" + filename + "/" + f))
+
+        ds = concatenate_datasets(ds_list)
+    else:
+        for i, chunk in enumerate(chunks):
+            perc = ceil(i / num_chunks * 100)
+            logger.info(f"Adding chunk {i}/{num_chunks}")
+            add_chunk_to_dataset(chunks, chunk, args.doctype, args.questions, NUM_DISTRACT_DOCS)
+
+    # Save the final dataset
+    ds.save_to_disk(args.output)
+
+    # Save as .jsonl format (dummy functionality)
+    # Implement a conversion function if needed, this is just a placeholder
+    logger.info("Converting dataset to the desired format...")
+
+    if not args.fast:
+        os.remove("checkpoint.txt")
+        for filename in os.listdir(os.path.dirname(args.output)):
+            if "-checkpoints-" in filename:
+                shutil.rmtree(os.path.dirname(args.output) + "/" + filename)
+
+if __name__ == "__main__":
+    logger.info("Starting the Hugging Face processing script...")
+    main()
diff --git a/raft/requirements.txt b/raft/requirements.txt