@@ -1,78 +1,145 @@
-async_mode: threaded
-cache:
-  base_dir: cache
-  type: file
-chunks:
-  group_by_columns:
-  - id
-  overlap: 64
-  size: 512
-claim_extraction:
-  description: Any claims or facts that could be relevant to information discovery.
-  max_gleanings: 0
-  prompt: prompts/claim_extraction.txt
-cluster_graph:
-  max_cluster_size: 10
-community_report:
-  max_input_length: 8000
-  max_length: 2000
-  prompt: prompts/community_report.txt
-embed_graph:
-  enabled: false
+
+encoding_model: cl100k_base
+skip_workflows: []
+llm:
+  api_key: ${GRAPHRAG_API_KEY}
+  type: openai_chat # or azure_openai_chat
+  model: mistral-nemo:12b-instruct-2407-fp16
+  model_supports_json: true # recommended if this is available for your model.
+  max_tokens: 8192
+  # request_timeout: 180.0
+  api_base: http://localhost:11434/v1
+  # api_version: 2024-02-15-preview
+  # organization: <organization_id>
+  # deployment_name: <azure_model_deployment_name>
+  # tokens_per_minute: 150_000 # set a leaky bucket throttle
+  # requests_per_minute: 10_000 # set a leaky bucket throttle
+  max_retries: 3
+  # max_retry_wait: 10.0
+  # sleep_on_rate_limit_recommendation: true # whether to sleep when azure suggests wait-times
+  concurrent_requests: 25 # the number of parallel inflight requests that may be made
+
+parallelization:
+  stagger: 0.3
+  num_threads: 50 # the number of threads to use for parallel processing
+
+async_mode: threaded # or asyncio
+
 embeddings:
-  async_mode: threaded
+  ## parallelization: override the global parallelization settings for embeddings
+  async_mode: threaded # or asyncio
   llm:
-    api_base: http://localhost:11434/api
     api_key: ${GRAPHRAG_API_KEY}
-    concurrent_requests: 25
+    type: openai_embedding # or azure_openai_embedding
     model: nomic-embed-text:latest
-    model_supports_json: true
-    provider: openai_embedding
-    type: openai_embedding
-encoding_model: cl100k_base
-entity_extraction:
-  entity_types:
-  - organization
-  - person
-  - geo
-  - event
-  max_gleanings: 0
-  prompt: prompts/entity_extraction.txt
-global_search:
-  concurrency: 32
+    api_base: http://localhost:11434/api
+    # api_version: 2024-02-15-preview
+    # organization: <organization_id>
+    # deployment_name: <azure_model_deployment_name>
+    # tokens_per_minute: 150_000 # set a leaky bucket throttle
+    # requests_per_minute: 10_000 # set a leaky bucket throttle
+    max_retries: 3
+    # max_retry_wait: 10.0
+    # sleep_on_rate_limit_recommendation: true # whether to sleep when azure suggests wait-times
+    concurrent_requests: 25 # the number of parallel inflight requests that may be made
+    #batch_size: 1 # the number of documents to send in a single request
+    #batch_max_tokens: 4000 # the maximum number of tokens to send in a single request
+    # target: required # or optional
+
+
+
+chunks:
+  size: 512
+  overlap: 64
+  group_by_columns: [id] # by default, we don't allow chunks to cross documents
+
 input:
-  base_dir: input
+  type: file # or blob
+  file_type: text # or csv
+  base_dir: "input"
   file_encoding: utf-8
-  file_pattern: .*\.txt$
-  file_type: text
-  type: file
-llm:
-  api_base: http://localhost:11434/v1
-  api_key: ${GRAPHRAG_API_KEY}
-  concurrent_requests: 25
-  max_tokens: 1024
-  model: qwen2:7b
-  model_supports_json: true
-  provider: openai_chat
-  temperature: 0.5
-  type: openai
-local_search: null
-parallelization:
-  num_threads: 50
-  stagger: 0.3
-reporting:
-  base_dir: output/${timestamp}/reports
-  type: file
-skip_workflows: []
-snapshots:
-  graphml: true
-  raw_entities: true
-  top_level_nodes: true
+  file_pattern: ".*\\.txt$"
+
+cache:
+  type: file # or blob
+  base_dir: "cache"
+  # connection_string: <azure_blob_storage_connection_string>
+  # container_name: <azure_blob_storage_container_name>
+
 storage:
-  base_dir: output/${timestamp}/artifacts
-  type: file
+  type: file # or blob
+  base_dir: "output/${timestamp}/artifacts"
+  # connection_string: <azure_blob_storage_connection_string>
+  # container_name: <azure_blob_storage_container_name>
+
+reporting:
+  type: file # or console, blob
+  base_dir: "output/${timestamp}/reports"
+  # connection_string: <azure_blob_storage_connection_string>
+  # container_name: <azure_blob_storage_container_name>
+
+entity_extraction:
+  ## llm: override the global llm settings for this task
+  ## parallelization: override the global parallelization settings for this task
+  ## async_mode: override the global async_mode settings for this task
+  prompt: "prompts/entity_extraction.txt"
+  entity_types: [organization,person,geo,event]
+  max_gleanings: 0
+
 summarize_descriptions:
+  ## llm: override the global llm settings for this task
+  ## parallelization: override the global parallelization settings for this task
+  ## async_mode: override the global async_mode settings for this task
+  prompt: "prompts/summarize_descriptions.txt"
   max_length: 500
-  prompt: prompts/summarize_descriptions.txt
+
+claim_extraction:
+  ## llm: override the global llm settings for this task
+  ## parallelization: override the global parallelization settings for this task
+  ## async_mode: override the global async_mode settings for this task
+  # enabled: true
+  prompt: "prompts/claim_extraction.txt"
+  description: "Any claims or facts that could be relevant to information discovery."
+  max_gleanings: 0
+
+community_reports:
+  ## llm: override the global llm settings for this task
+  ## parallelization: override the global parallelization settings for this task
+  ## async_mode: override the global async_mode settings for this task
+  prompt: "prompts/community_report.txt"
+  max_length: 2000
+  max_input_length: 4000
+
+cluster_graph:
+  max_cluster_size: 10
+
+embed_graph:
+  enabled: false # if true, will generate node2vec embeddings for nodes
+  # num_walks: 10
+  # walk_length: 40
+  # window_size: 2
+  # iterations: 3
+  # random_seed: 597832
+
 umap:
-  enabled: false
+  enabled: false # if true, will generate UMAP embeddings for nodes
+
+snapshots:
+  graphml: false
+  raw_entities: false
+  top_level_nodes: false
+
+local_search:
+  # text_unit_prop: 0.5
+  # community_prop: 0.1
+  # conversation_history_max_turns: 5
+  # top_k_mapped_entities: 10
+  # top_k_relationships: 10
+  # max_tokens: 12000
+
+global_search:
+  # max_tokens: 12000
+  # data_max_tokens: 12000
+  # map_max_tokens: 1000
+  # reduce_max_tokens: 2000
+  # concurrency: 32
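
A note on the endpoints in this diff: the global llm block points at Ollama's OpenAI-compatible routes (api_base http://localhost:11434/v1), while the embeddings.llm block points at the native routes (api_base http://localhost:11434/api). Smoke-testing both before running graphrag index can save a long failed run. The sketch below is illustrative and not part of the commit: it assumes Ollama is serving on localhost:11434 with both models from the config already pulled; the post() helper and the "ollama" key fallback are ad-hoc (Ollama ignores the API key, but GraphRAG needs GRAPHRAG_API_KEY set so the ${GRAPHRAG_API_KEY} reference resolves).

# smoke_test.py - check the two Ollama endpoints referenced in settings.yaml.
# Endpoint paths follow Ollama's documented OpenAI-compatible chat API and
# native embeddings API; stdlib only, so no extra dependencies.
import json
import os
import urllib.request

BASE = "http://localhost:11434"
KEY = os.environ.get("GRAPHRAG_API_KEY", "ollama")  # Ollama ignores the key value

def post(path: str, payload: dict) -> dict:
    # Minimal JSON POST helper (hypothetical name, not part of GraphRAG).
    req = urllib.request.Request(
        BASE + path,
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json",
                 "Authorization": f"Bearer {KEY}"},
    )
    with urllib.request.urlopen(req, timeout=60) as resp:
        return json.load(resp)

# Chat: the llm block targets the OpenAI-compatible /v1 routes.
chat = post("/v1/chat/completions", {
    "model": "mistral-nemo:12b-instruct-2407-fp16",
    "messages": [{"role": "user", "content": "Reply with OK."}],
})
print("chat:", chat["choices"][0]["message"]["content"])

# Embeddings: the embeddings block targets Ollama's native /api routes.
emb = post("/api/embeddings", {
    "model": "nomic-embed-text:latest",
    "prompt": "hello world",
})
print("embedding dims:", len(emb["embedding"]))

If both calls succeed but indexing still fails at the embedding step, the /v1 vs /api pairing in the two api_base values is the first thing to re-check, since the OpenAI-compatible and native routes expect different request shapes.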