Skip to content

Commit b287da4

Browse files
Merge pull request microsoft#1 from KylinMountain/master
Merge from upstream
2 parents 54ac679 + f5d3ecd commit b287da4

File tree

24 files changed

+858
-601
lines changed

24 files changed

+858
-601
lines changed

.semversioner/0.3.0.json

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
{
2+
"changes": [
3+
{
4+
"description": "Implement auto templating API.",
5+
"type": "minor"
6+
},
7+
{
8+
"description": "Implement query engine API.",
9+
"type": "minor"
10+
},
11+
{
12+
"description": "Fix file dumps using json for non ASCII chars",
13+
"type": "patch"
14+
},
15+
{
16+
"description": "Stabilize smoke tests for query context building",
17+
"type": "patch"
18+
},
19+
{
20+
"description": "fix query embedding",
21+
"type": "patch"
22+
},
23+
{
24+
"description": "fix sort_context & max_tokens params in verb",
25+
"type": "patch"
26+
}
27+
],
28+
"created_at": "2024-08-12T23:51:49+00:00",
29+
"version": "0.3.0"
30+
}

CHANGELOG.md

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,21 @@
11
# Changelog
2-
32
Note: version releases in the 0.x.y range may introduce breaking changes.
43

4+
## 0.3.0
5+
6+
- minor: Implement auto templating API.
7+
- minor: Implement query engine API.
8+
- patch: Fix file dumps using json for non ASCII chars
9+
- patch: Stabilize smoke tests for query context building
10+
- patch: fix query embedding
11+
- patch: fix sort_context & max_tokens params in verb
12+
513
## 0.2.2
614

715
- patch: Add a check if there is no community record added in local search context
816
- patch: Add sepparate workflow for Python Tests
917
- patch: Docs updates
18+
- patch: Run smoke tests on 4o
1019

1120
## 0.2.1
1221

CODEOWNERS

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,5 +2,4 @@
22
# the repo. Unless a later match takes precedence,
33
# @global-owner1 and @global-owner2 will be requested for
44
# review when someone opens a pull request.
5-
* @microsoft/societal-resilience
6-
* @microsoft/graphrag-core-team
5+
* @microsoft/societal-resilience @microsoft/graphrag-core-team

graphrag/index/graph/extractors/community_reports/sort_context.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ def _get_context_string(
144144
new_context_string = _get_context_string(
145145
sorted_nodes, sorted_edges, sorted_claims, sub_community_reports
146146
)
147-
if num_tokens(context_string) > max_tokens:
147+
if num_tokens(new_context_string) > max_tokens:
148148
break
149149
context_string = new_context_string
150150

graphrag/index/graph/extractors/summarize/prompts.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
Given one or two entities, and a list of descriptions, all related to the same entity or group of entities.
99
Please concatenate all of these into a single, comprehensive description. Make sure to include information collected from all the descriptions.
1010
If the provided descriptions are contradictory, please resolve the contradictions and provide a single, coherent summary.
11-
Make sure it is written in third person, and include the entity names so we the have full context.
11+
Make sure it is written in third person, and include the entity names so we have the full context.
1212
1313
#######
1414
-Data-

graphrag/index/workflows/v1/create_final_community_reports.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,10 @@ def build_steps(
1919
"""
2020
covariates_enabled = config.get("covariates_enabled", False)
2121
create_community_reports_config = config.get("create_community_reports", {})
22+
community_report_strategy = create_community_reports_config.get("strategy", {})
23+
community_report_max_input_length = community_report_strategy.get(
24+
"max_input_length", 16_000
25+
)
2226
base_text_embed = config.get("text_embed", {})
2327
community_report_full_content_embed_config = config.get(
2428
"community_report_full_content_embed", base_text_embed
@@ -77,6 +81,7 @@ def build_steps(
7781
{
7882
"id": "local_contexts",
7983
"verb": "prepare_community_reports",
84+
"args": {"max_tokens": community_report_max_input_length},
8085
"input": {
8186
"source": "nodes",
8287
"nodes": "nodes",

graphrag/prompt_tune/__main__.py

Lines changed: 39 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1,53 +1,48 @@
11
# Copyright (c) 2024 Microsoft Corporation.
22
# Licensed under the MIT License
33

4-
"""The Prompt auto templating package root."""
4+
"""The auto templating package root."""
55

66
import argparse
77
import asyncio
8-
from enum import Enum
9-
10-
from graphrag.prompt_tune.generator import MAX_TOKEN_COUNT
11-
from graphrag.prompt_tune.loader import MIN_CHUNK_SIZE
128

9+
from .api import DocSelectionType
1310
from .cli import prompt_tune
14-
15-
16-
class DocSelectionType(Enum):
17-
"""The type of document selection to use."""
18-
19-
ALL = "all"
20-
RANDOM = "random"
21-
TOP = "top"
22-
AUTO = "auto"
23-
24-
def __str__(self):
25-
"""Return the string representation of the enum value."""
26-
return self.value
27-
11+
from .generator import MAX_TOKEN_COUNT
12+
from .loader import MIN_CHUNK_SIZE
2813

2914
if __name__ == "__main__":
30-
parser = argparse.ArgumentParser()
15+
parser = argparse.ArgumentParser(
16+
prog="python -m graphrag.prompt_tune",
17+
description="The graphrag auto templating module.",
18+
)
19+
20+
parser.add_argument(
21+
"--config",
22+
help="Configuration yaml file to use when generating prompts",
23+
required=True,
24+
type=str,
25+
)
3126

3227
parser.add_argument(
3328
"--root",
34-
help="The data project root. Including the config yml, json or .env",
29+
help="Data project root. Default: current directory",
3530
required=False,
3631
type=str,
3732
default=".",
3833
)
3934

4035
parser.add_argument(
4136
"--domain",
42-
help="The domain your input data is related to. For example 'space science', 'microbiology', 'environmental news'. If left empty, the domain will be inferred from the input data.",
37+
help="Domain your input data is related to. For example 'space science', 'microbiology', 'environmental news'. If not defined, the domain will be inferred from the input data.",
4338
required=False,
4439
default="",
4540
type=str,
4641
)
4742

4843
parser.add_argument(
49-
"--method",
50-
help="The method to select documents, one of: all, random, top or auto",
44+
"--selection-method",
45+
help=f"Chunk selection method. Default: {DocSelectionType.RANDOM}",
5146
required=False,
5247
type=DocSelectionType,
5348
choices=list(DocSelectionType),
@@ -56,47 +51,47 @@ def __str__(self):
5651

5752
parser.add_argument(
5853
"--n_subset_max",
59-
help="The number of text chunks to embed when using auto selection method",
54+
help="Number of text chunks to embed when using auto selection method. Default: 300",
6055
required=False,
6156
type=int,
6257
default=300,
6358
)
6459

6560
parser.add_argument(
6661
"--k",
67-
help="The maximum number of documents to select from each centroid when using auto selection method",
62+
help="Maximum number of documents to select from each centroid when using auto selection method. Default: 15",
6863
required=False,
6964
type=int,
7065
default=15,
7166
)
7267

7368
parser.add_argument(
7469
"--limit",
75-
help="The limit of files to load when doing random or top selection",
70+
help="Number of documents to load when doing random or top selection. Default: 15",
7671
type=int,
7772
required=False,
7873
default=15,
7974
)
8075

8176
parser.add_argument(
8277
"--max-tokens",
83-
help="Max token count for prompt generation",
78+
help=f"Max token count for prompt generation. Default: {MAX_TOKEN_COUNT}",
8479
type=int,
8580
required=False,
8681
default=MAX_TOKEN_COUNT,
8782
)
8883

8984
parser.add_argument(
9085
"--min-examples-required",
91-
help="The minimum number of examples required in entity extraction prompt",
86+
help="Minimum number of examples required in the entity extraction prompt. Default: 2",
9287
type=int,
9388
required=False,
9489
default=2,
9590
)
9691

9792
parser.add_argument(
9893
"--chunk-size",
99-
help="Max token count for prompt generation",
94+
help=f"Max token count for prompt generation. Default: {MIN_CHUNK_SIZE}",
10095
type=int,
10196
required=False,
10297
default=MIN_CHUNK_SIZE,
@@ -120,7 +115,7 @@ def __str__(self):
120115

121116
parser.add_argument(
122117
"--output",
123-
help="Folder to save the generated prompts to",
118+
help="Directory to save generated prompts to. Default: 'prompts'",
124119
type=str,
125120
required=False,
126121
default="prompts",
@@ -132,17 +127,18 @@ def __str__(self):
132127

133128
loop.run_until_complete(
134129
prompt_tune(
135-
args.root,
136-
args.domain,
137-
str(args.method),
138-
args.limit,
139-
args.max_tokens,
140-
args.chunk_size,
141-
args.language,
142-
args.no_entity_types,
143-
args.output,
144-
args.n_subset_max,
145-
args.k,
146-
args.min_examples_required,
130+
config=args.config,
131+
root=args.root,
132+
domain=args.domain,
133+
selection_method=args.selection_method,
134+
limit=args.limit,
135+
max_tokens=args.max_tokens,
136+
chunk_size=args.chunk_size,
137+
language=args.language,
138+
skip_entity_types=args.no_entity_types,
139+
output=args.output,
140+
n_subset_max=args.n_subset_max,
141+
k=args.k,
142+
min_examples_required=args.min_examples_required,
147143
)
148144
)

0 commit comments

Comments (0)