
Commit bb572a2

StanGirard, AmineDiro, jacopo-chevallard, and chloedia authored
docs(core): init (#3365)
# Description

Please include a summary of the changes and the related issue. Please also include relevant motivation and context.

## Checklist before requesting a review

Please delete options that are not relevant.

- [ ] My code follows the style guidelines of this project
- [ ] I have performed a self-review of my code
- [ ] I have commented hard-to-understand areas
- [ ] I have ideally added tests that prove my fix is effective or that my feature works
- [ ] New and existing unit tests pass locally with my changes
- [ ] Any dependent changes have been merged

## Screenshots (if appropriate):

---------

Co-authored-by: aminediro <[email protected]>
Co-authored-by: Jacopo Chevallard <[email protected]>
Co-authored-by: chloedia <[email protected]>
Co-authored-by: AmineDiro <[email protected]>
1 parent 6c2858f commit bb572a2

19 files changed: +743 −43 lines

backend/core/quivr_core/base_config.py

Lines changed: 24 additions & 0 deletions
```diff
@@ -5,10 +5,34 @@
 class QuivrBaseConfig(BaseModel):
+    """
+    Base configuration class for Quivr.
+
+    This class extends Pydantic's BaseModel and provides a foundation for
+    configuration management in quivr-core.
+
+    Attributes:
+        model_config (ConfigDict): Configuration for the Pydantic model.
+            It's set to forbid extra attributes, ensuring strict adherence
+            to the defined schema.
+
+    Class Methods:
+        from_yaml: Create an instance of the class from a YAML file.
+    """
+
     model_config = ConfigDict(extra="forbid")
 
     @classmethod
     def from_yaml(cls, file_path: str | Path):
+        """
+        Create an instance of the class from a YAML file.
+
+        Args:
+            file_path (str | Path): The path to the YAML file.
+
+        Returns:
+            QuivrBaseConfig: An instance of the class initialized with the data from the YAML file.
+        """
        # Load the YAML file
        with open(file_path, "r") as stream:
            config_data = yaml.safe_load(stream)
```
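As the docstring notes, `from_yaml` is the intended entry point for subclasses of `QuivrBaseConfig`. A minimal sketch of that usage, assuming a hypothetical `MyAppConfig` subclass and a matching `config.yaml` (neither appears in this diff):

```python
from pathlib import Path

from quivr_core.base_config import QuivrBaseConfig


# Hypothetical subclass for illustration; not part of this diff.
class MyAppConfig(QuivrBaseConfig):
    top_k: int = 5
    model_name: str = "gpt-4o-mini"


# Assuming config.yaml contains exactly:
#   top_k: 10
#   model_name: gpt-4o
# A key not declared on the model would raise a pydantic ValidationError,
# since model_config = ConfigDict(extra="forbid").
config = MyAppConfig.from_yaml(Path("config.yaml"))
print(config.top_k)  # -> 10
```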

backend/core/quivr_core/brain/brain.py

Lines changed: 183 additions & 0 deletions
```diff
@@ -46,6 +46,24 @@
 async def process_files(
     storage: StorageBase, skip_file_error: bool, **processor_kwargs: dict[str, Any]
 ) -> list[Document]:
+    """
+    Process files in storage.
+    This function takes a StorageBase and returns a list of langchain documents.
+
+    Args:
+        storage (StorageBase): The storage containing the files to process.
+        skip_file_error (bool): Whether to skip files that cannot be processed.
+        processor_kwargs (dict[str, Any]): Additional arguments for the processor.
+
+    Returns:
+        list[Document]: List of processed documents in the Langchain Document format.
+
+    Raises:
+        ValueError: If a file cannot be processed and skip_file_error is False.
+        Exception: If no processor is found for a file of a specific type and skip_file_error is False.
+    """
+
     knowledge = []
     for file in await storage.get_files():
         try:
@@ -71,6 +89,36 @@ async def process_files(
 
 
 class Brain:
+    """
+    A class representing a Brain.
+
+    This class allows for the creation of a Brain, which is a collection of knowledge one wants to retrieve information from.
+
+    A Brain is set to:
+
+    * Store files in the storage of your choice (local, S3, etc.)
+    * Process the files in the storage to extract text and metadata in a wide range of formats.
+    * Store the processed files in the vector store of your choice (FAISS, PGVector, etc.) - defaults to FAISS.
+    * Create an index of the processed files.
+    * Use the *Quivr* workflow for retrieval-augmented generation.
+
+    A Brain is able to:
+
+    * Search for information in the vector store.
+    * Answer questions about the knowledge in the Brain.
+    * Stream the answer to the question.
+
+    Attributes:
+        name (str): The name of the brain.
+        id (UUID): The unique identifier of the brain.
+        storage (StorageBase): The storage used to store the files.
+        llm (LLMEndpoint): The language model used to generate the answer.
+        vector_db (VectorStore): The vector store used to store the processed files.
+        embedder (Embeddings): The embeddings used to create the index of the processed files.
+
+    """
     def __init__(
         self,
         *,
@@ -106,6 +154,22 @@ def print_info(self):
 
     @classmethod
     def load(cls, folder_path: str | Path) -> Self:
+        """
+        Load a brain from a folder path.
+
+        Args:
+            folder_path (str | Path): The path to the folder containing the brain.
+
+        Returns:
+            Brain: The brain loaded from the folder path.
+
+        Example:
+        ```python
+        brain_loaded = Brain.load("path/to/brain")
+        brain_loaded.print_info()
+        ```
+
+        """
         if isinstance(folder_path, str):
             folder_path = Path(folder_path)
         if not folder_path.exists():
@@ -154,6 +218,20 @@ def load(cls, folder_path: str | Path) -> Self:
         )
 
     async def save(self, folder_path: str | Path):
+        """
+        Save the brain to a folder path.
+
+        Args:
+            folder_path (str | Path): The path to the folder where the brain will be saved.
+
+        Returns:
+            str: The path to the folder where the brain was saved.
+
+        Example:
+        ```python
+        await brain.save("path/to/brain")
+        ```
+        """
         if isinstance(folder_path, str):
             folder_path = Path(folder_path)
 
@@ -247,6 +325,28 @@ async def afrom_files(
         skip_file_error: bool = False,
         processor_kwargs: dict[str, Any] | None = None,
     ):
+        """
+        Create a brain from a list of file paths.
+
+        Args:
+            name (str): The name of the brain.
+            file_paths (list[str | Path]): The list of file paths to add to the brain.
+            vector_db (VectorStore | None): The vector store used to store the processed files.
+            storage (StorageBase): The storage used to store the files.
+            llm (LLMEndpoint | None): The language model used to generate the answer.
+            embedder (Embeddings | None): The embeddings used to create the index of the processed files.
+            skip_file_error (bool): Whether to skip files that cannot be processed.
+            processor_kwargs (dict[str, Any] | None): Additional arguments for the processor.
+
+        Returns:
+            Brain: The brain created from the file paths.
+
+        Example:
+        ```python
+        brain = await Brain.afrom_files(name="My Brain", file_paths=["file1.pdf", "file2.pdf"])
+        brain.print_info()
+        ```
+        """
         if llm is None:
             llm = default_llm()
 
@@ -327,6 +427,28 @@ async def afrom_langchain_documents(
         llm: LLMEndpoint | None = None,
         embedder: Embeddings | None = None,
     ) -> Self:
+        """
+        Create a brain from a list of langchain documents.
+
+        Args:
+            name (str): The name of the brain.
+            langchain_documents (list[Document]): The list of langchain documents to add to the brain.
+            vector_db (VectorStore | None): The vector store used to store the processed files.
+            storage (StorageBase): The storage used to store the files.
+            llm (LLMEndpoint | None): The language model used to generate the answer.
+            embedder (Embeddings | None): The embeddings used to create the index of the processed files.
+
+        Returns:
+            Brain: The brain created from the langchain documents.
+
+        Example:
+        ```python
+        from langchain_core.documents import Document
+        documents = [Document(page_content="Hello, world!")]
+        brain = await Brain.afrom_langchain_documents(name="My Brain", langchain_documents=documents)
+        brain.print_info()
+        ```
+        """
         if llm is None:
             llm = default_llm()
 
@@ -357,6 +479,26 @@ async def asearch(
         filter: Callable | Dict[str, Any] | None = None,
         fetch_n_neighbors: int = 20,
     ) -> list[SearchResult]:
+        """
+        Search for relevant documents in the brain based on a query.
+
+        Args:
+            query (str | Document): The query to search for.
+            n_results (int): The number of results to return.
+            filter (Callable | Dict[str, Any] | None): The filter to apply to the search.
+            fetch_n_neighbors (int): The number of neighbors to fetch.
+
+        Returns:
+            list[SearchResult]: The list of retrieved chunks.
+
+        Example:
+        ```python
+        brain = Brain.from_files(name="My Brain", file_paths=["file1.pdf", "file2.pdf"])
+        results = await brain.asearch("Why everybody loves Quivr?")
+        for result in results:
+            print(result.chunk.page_content)
+        ```
+        """
         if not self.vector_db:
             raise ValueError("No vector db configured for this brain")
 
@@ -383,6 +525,26 @@ def ask(
         list_files: list[QuivrKnowledge] | None = None,
         chat_history: ChatHistory | None = None,
     ) -> ParsedRAGResponse:
+        """
+        Ask a question to the brain and get a generated answer.
+
+        Args:
+            question (str): The question to ask.
+            retrieval_config (RetrievalConfig | None): The retrieval configuration (see RetrievalConfig docs).
+            rag_pipeline (Type[Union[QuivrQARAG, QuivrQARAGLangGraph]] | None): The RAG pipeline to use.
+            list_files (list[QuivrKnowledge] | None): The list of files to include in the RAG pipeline.
+            chat_history (ChatHistory | None): The chat history to use.
+
+        Returns:
+            ParsedRAGResponse: The generated answer.
+
+        Example:
+        ```python
+        brain = Brain.from_files(name="My Brain", file_paths=["file1.pdf", "file2.pdf"])
+        answer = brain.ask("What is the meaning of life?")
+        print(answer.answer)
+        ```
+        """
         llm = self.llm
 
         # If you passed a different llm model we'll override the brain one
@@ -420,6 +582,27 @@ async def ask_streaming(
         list_files: list[QuivrKnowledge] | None = None,
         chat_history: ChatHistory | None = None,
     ) -> AsyncGenerator[ParsedRAGChunkResponse, ParsedRAGChunkResponse]:
+        """
+        Ask a question to the brain and get a streamed generated answer.
+
+        Args:
+            question (str): The question to ask.
+            retrieval_config (RetrievalConfig | None): The retrieval configuration (see RetrievalConfig docs).
+            rag_pipeline (Type[Union[QuivrQARAG, QuivrQARAGLangGraph]] | None): The RAG pipeline to use.
+            list_files (list[QuivrKnowledge] | None): The list of files to include in the RAG pipeline.
+            chat_history (ChatHistory | None): The chat history to use.
+
+        Returns:
+            AsyncGenerator[ParsedRAGChunkResponse, ParsedRAGChunkResponse]: The streamed generated answer.
+
+        Example:
+        ```python
+        brain = Brain.from_files(name="My Brain", file_paths=["file1.pdf", "file2.pdf"])
+        async for chunk in brain.ask_streaming("What is the meaning of life?"):
+            print(chunk.answer)
+        ```
+
+        """
         llm = self.llm
 
         # If you passed a different llm model we'll override the brain one
```
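Taken together, these docstrings describe the full Brain lifecycle. A minimal end-to-end sketch that strings them together, assuming `Brain` is re-exported from the `quivr_core` package root and that the PDF paths exist (neither assumption is established by this diff):

```python
import asyncio

# Assumed import path: the class lives in backend/core/quivr_core/brain/brain.py.
from quivr_core import Brain


async def main():
    # Index two local files (assumed to exist) with the documented defaults:
    # FAISS vector store plus the default LLM and embedder.
    brain = await Brain.afrom_files(name="My Brain", file_paths=["file1.pdf", "file2.pdf"])
    brain.print_info()

    # Retrieval only: fetch the chunks most similar to the query.
    for result in await brain.asearch("What is Quivr?", n_results=4):
        print(result.chunk.page_content)

    # Full RAG: retrieve, then generate a single parsed answer.
    answer = brain.ask("What is Quivr?")
    print(answer.answer)

    # Streaming variant of the same question.
    async for chunk in brain.ask_streaming("What is Quivr?"):
        print(chunk.answer, end="", flush=True)

    # Persist, then reload from the returned folder
    # (save() returns the path it wrote to, per its docstring).
    saved_path = await brain.save("path/to/brains")
    Brain.load(saved_path).print_info()


asyncio.run(main())
```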

backend/core/quivr_core/chat.py

Lines changed: 37 additions & 7 deletions
```diff
@@ -10,21 +10,35 @@
 
 
 class ChatHistory:
     """
-    Chat history is a list of ChatMessage.
-    It is used to store the chat history of a chat.
+    ChatHistory is a class that maintains a record of chat conversations. Each message
+    in the history is represented by an instance of the `ChatMessage` class, and the
+    chat history is stored internally as a list of these `ChatMessage` objects.
+    The class provides methods to retrieve, append, iterate, and manipulate the chat
+    history, as well as utilities to convert the messages into specific formats
+    and support deep copying.
     """
 
     def __init__(self, chat_id: UUID, brain_id: UUID | None) -> None:
+        """Init a new ChatHistory object.
+
+        Args:
+            chat_id (UUID): A unique identifier for the chat session.
+            brain_id (UUID | None): An optional identifier for the brain associated with the chat.
+        """
         self.id = chat_id
         self.brain_id = brain_id
         # TODO(@aminediro): maybe use a deque() instead ?
         self._msgs: list[ChatMessage] = []
 
     def get_chat_history(self, newest_first: bool = False):
-        """Returns a ChatMessage list sorted by time
+        """
+        Retrieves the chat history, optionally sorted in reverse chronological order.
+
+        Args:
+            newest_first (bool, optional): If True, returns the messages in reverse order (newest first). Defaults to False.
 
         Returns:
-            list[ChatMessage]: list of chat messages
+            List[ChatMessage]: A sorted list of chat messages.
         """
         history = sorted(self._msgs, key=lambda msg: msg.message_time)
         if newest_first:
@@ -38,7 +52,11 @@ def append(
         self, langchain_msg: AIMessage | HumanMessage, metadata: dict[str, Any] = {}
     ):
         """
-        Append a message to the chat history.
+        Appends a new message to the chat history.
+
+        Args:
+            langchain_msg (AIMessage | HumanMessage): The message content (either an AI or Human message).
+            metadata (dict[str, Any], optional): Additional metadata related to the message. Defaults to an empty dictionary.
         """
         chat_msg = ChatMessage(
             chat_id=self.id,
@@ -52,7 +70,13 @@ def append(
 
     def iter_pairs(self) -> Generator[Tuple[HumanMessage, AIMessage], None, None]:
         """
-        Iterate over the chat history as pairs of HumanMessage and AIMessage.
+        Iterates over the chat history in pairs, returning a HumanMessage followed by an AIMessage.
+
+        Yields:
+            Tuple[HumanMessage, AIMessage]: Pairs of human and AI messages.
+
+        Raises:
+            AssertionError: If the messages in the pair are not in the expected order (i.e., a HumanMessage followed by an AIMessage).
         """
         # Reverse the chat_history, newest first
         it = iter(self.get_chat_history(newest_first=True))
@@ -66,7 +90,13 @@ def iter_pairs(self) -> Generator[Tuple[HumanMessage, AIMessage], None, None]:
             yield (human_message.msg, ai_message.msg)
 
     def to_list(self) -> List[HumanMessage | AIMessage]:
-        """Format the chat history into a list of HumanMessage and AIMessage"""
+        """
+        Converts the chat history into a list of raw HumanMessage or AIMessage objects.
+
+        Returns:
+            list[HumanMessage | AIMessage]: A list of messages in their raw form, without metadata.
+        """
+
         return [_msg.msg for _msg in self._msgs]
 
     def __deepcopy__(self, memo):
```
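The reworked docstrings imply the following usage of `ChatHistory`. A short sketch, assuming the module path `quivr_core.chat` and that `ChatMessage` records a `message_time` when appended (both inferred from this diff, not shown in it):

```python
from uuid import uuid4

from langchain_core.messages import AIMessage, HumanMessage

# Assumed module path for the class edited above.
from quivr_core.chat import ChatHistory

history = ChatHistory(chat_id=uuid4(), brain_id=None)
history.append(HumanMessage(content="What is Quivr?"))
history.append(
    AIMessage(content="An open-source RAG framework."),
    metadata={"model": "gpt-4o"},  # optional, stored alongside the message
)

# Raw LangChain messages, oldest first, metadata stripped.
for msg in history.to_list():
    print(type(msg).__name__, ":", msg.content)

# (HumanMessage, AIMessage) pairs, e.g. to rebuild a prompt.
for human, ai in history.iter_pairs():
    print(human.content, "->", ai.content)
```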
