Mirror of https://github.com/microsoft/graphrag.git

commit 2f763f51c0 (parent 7235c6faf5): update ruff, move ruff settings to ruff.toml
.gitattributes (vendored, new file, 11 lines)
@@ -0,0 +1,11 @@
+*.txt text eol=lf
+*.md text eol=lf
+*.yml text eol=lf
+*.html text eol=lf
+*.py text eol=lf
+*.toml text eol=lf
+.gitattributes text eol=lf
+.gitignore text eol=lf
+*.lock
+CODEOWNERS text eol=lf
+LICENSE text eol=lf
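
The eol=lf rules pin these file types to LF endings on every platform, so a checkout on Windows (where core.autocrlf may otherwise rewrite endings) stays byte-identical with CI. A hypothetical companion check, not part of this commit, that lists files of the normalized types still carrying CRLF endings in a working tree:

    # Hypothetical helper: find CRLF endings in the file types that
    # .gitattributes normalizes to LF. Skips the .git directory.
    from pathlib import Path

    LF_TYPES = {".txt", ".md", ".yml", ".html", ".py", ".toml"}

    def find_crlf(root: str = ".") -> list[Path]:
        return [
            p
            for p in Path(root).rglob("*")
            if p.suffix in LF_TYPES
            and ".git" not in p.parts
            and p.is_file()
            and b"\r\n" in p.read_bytes()
        ]

    print(find_crlf())

After a fresh checkout with these attributes in place, the list should come back empty.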
.vscode/settings.json (vendored, 27 lines changed)
@@ -12,19 +12,11 @@
     "typescript.preferences.importModuleSpecifierEnding": "js",
     "explorer.fileNesting.enabled": true,
     "explorer.fileNesting.patterns": {
         "*.ts": "${capture}.ts, ${capture}.hooks.ts, ${capture}.hooks.tsx, ${capture}.contexts.ts, ${capture}.stories.tsx, ${capture}.story.tsx, ${capture}.spec.tsx, ${capture}.base.ts, ${capture}.base.tsx, ${capture}.types.ts, ${capture}.styles.ts, ${capture}.styles.tsx, ${capture}.utils.ts, ${capture}.utils.tsx, ${capture}.constants.ts, ${capture}.module.scss, ${capture}.module.css, ${capture}.md",
         "*.js": "${capture}.js.map, ${capture}.min.js, ${capture}.d.ts",
         "*.jsx": "${capture}.js",
         "*.tsx": "${capture}.ts, ${capture}.hooks.ts, ${capture}.hooks.tsx, ${capture}.contexts.ts, ${capture}.stories.tsx, ${capture}.story.tsx, ${capture}.spec.tsx, ${capture}.base.ts, ${capture}.base.tsx, ${capture}.types.ts, ${capture}.styles.ts, ${capture}.styles.tsx, ${capture}.utils.ts, ${capture}.utils.tsx, ${capture}.constants.ts, ${capture}.module.scss, ${capture}.module.css, ${capture}.md, ${capture}.css",
         "tsconfig.json": "tsconfig.*.json",
         "package.json": "package-lock.json, turbo.json, tsconfig.json, rome.json, biome.json, .npmignore, dictionary.txt, cspell.config.yaml",
         "README.md": "*.md, LICENSE, CODEOWNERS",
         ".eslintrc": ".eslintignore",
         ".prettierrc": ".prettierignore",
         ".gitattributes": ".gitignore",
         ".yarnrc.yml": "yarn.lock, .pnp.*",
         "jest.config.js": "jest.setup.mjs",
-        "pyproject.toml": "poetry.lock, poetry.toml, mkdocs.yaml",
+        "pyproject.toml": "*.lock, *.toml, mkdocs.yaml",
         "cspell.config.yaml": "dictionary.txt"
     },
     "azureFunctions.postDeployTask": "npm install (functions)",
@@ -36,6 +28,19 @@
         "node_modules{,/**}",
         ".vscode{,/**}"
     ],
+    "[python]": {
+        "editor.formatOnSave": true,
+        "editor.codeActionsOnSave": {
+            "source.organizeImports": "explicit",
+            "source.fixAll": "explicit"
+        },
+        "editor.defaultFormatter": "charliermarsh.ruff"
+    },
+    "notebook.formatOnSave.enabled": true,
+    "notebook.codeActionsOnSave": {
+        "notebook.source.fixAll": "explicit",
+        "notebook.source.organizeImports": "explicit"
+    },
     "python.defaultInterpreterPath": "python/services/.venv/bin/python",
     "python.languageServer": "Pylance",
     "cSpell.customDictionaries": {
@@ -47,5 +52,7 @@
         },
         "custom": true, // Enable the `custom` dictionary
         "internal-terms": true // Disable the `internal-terms` dictionary
-    }
-}
+    },
+    "ruff.configuration": "./ruff.toml",
+    "ruff.nativeServer": "on"
+}
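
The two new ruff.* keys point the charliermarsh.ruff extension at the relocated config and switch it to the Rust-based native language server, while the [python] and notebook blocks added above wire import sorting, safe autofixes, and formatting to every save. Roughly, a save now amounts to the CLI sequence below; this is a sketch assuming ruff is on PATH (the extension actually talks to its bundled server, not the CLI):

    # Approximate CLI equivalent of the on-save configuration above.
    import subprocess

    def format_like_editor(path: str) -> None:
        # "source.organizeImports": isort-style rules (I) with safe fixes
        subprocess.run(["ruff", "check", "--select", "I", "--fix", path])
        # "source.fixAll": safe fixes for all selected rules
        subprocess.run(["ruff", "check", "--fix", path])
        # "editor.formatOnSave" with ruff as the default formatter
        subprocess.run(["ruff", "format", path])

    format_like_editor("graphrag")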
@@ -332,10 +332,10 @@ class GlobalSearch(BaseSearch[GlobalContextBuilder]):
         for point in filtered_key_points:
             formatted_response_data = []
             formatted_response_data.append(
-                f'----Analyst {point["analyst"] + 1}----'
+                f"----Analyst {point['analyst'] + 1}----"
             )
             formatted_response_data.append(
-                f'Importance Score: {point["score"]}'  # type: ignore
+                f"Importance Score: {point['score']}"  # type: ignore
             )
             formatted_response_data.append(point["answer"])  # type: ignore
             formatted_response_text = "\n".join(formatted_response_data)

@@ -431,8 +431,8 @@ class GlobalSearch(BaseSearch[GlobalContextBuilder]):
         total_tokens = 0
         for point in filtered_key_points:
             formatted_response_data = [
-                f'----Analyst {point["analyst"] + 1}----',
-                f'Importance Score: {point["score"]}',
+                f"----Analyst {point['analyst'] + 1}----",
+                f"Importance Score: {point['score']}",
                 point["answer"],
             ]
             formatted_response_text = "\n".join(formatted_response_data)
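
Both GlobalSearch hunks are formatter churn from the ruff upgrade rather than behavior changes: the formatter prefers double quotes for the outer string, which flips the dictionary keys inside the replacement fields to single quotes. The two spellings build identical strings:

    # The old and new f-string spellings are runtime-equivalent.
    point = {"analyst": 0, "score": 85}

    old_style = f'----Analyst {point["analyst"] + 1}----'
    new_style = f"----Analyst {point['analyst'] + 1}----"
    assert old_style == new_style == "----Analyst 1----"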
poetry.lock (generated, 748 lines changed; file diff suppressed because it is too large)
pyproject.toml

@@ -99,11 +99,11 @@ ipykernel = "^6.29.4"
 jupyter = "^1.1.1"
 nbconvert = "^7.16.3"
 poethepoet = "^0.28.0"
-pyright = "^1.1.384"
+pyright = "^1.1.387"
 pytest = "^8.3.2"
 pytest-asyncio = "^0.24.0"
 pytest-timeout = "^2.3.1"
-ruff = "^0.6.9"
+ruff = "^0.7.1"
 semversioner = "^2.0.3"
 update-toml = "^0.2.1"
 deptry = "^0.20.0"
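
Note that both bumps change the constraint, not just the lockfile: for 0.x releases Poetry's caret pins the minor version, so ^0.6.9 can never resolve to ruff 0.7.1. A sketch of those semantics with the third-party packaging library, where the explicit ranges are the assumed caret translation:

    from packaging.specifiers import SpecifierSet
    from packaging.version import Version

    caret_0_6_9 = SpecifierSet(">=0.6.9,<0.7.0")  # ^0.6.9 on a 0.x version
    caret_0_7_1 = SpecifierSet(">=0.7.1,<0.8.0")  # ^0.7.1

    assert Version("0.7.1") not in caret_0_6_9  # why the bump was required
    assert Version("0.7.1") in caret_0_7_1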
@@ -170,98 +170,6 @@ ignore_fail = 'return_non_zero'
 sequence = ['_test_all', 'coverage_report']
 ignore_fail = 'return_non_zero'
 
-[tool.ruff]
-target-version = "py310"
-extend-include = ["*.ipynb"]
-
-[tool.ruff.format]
-preview = true
-docstring-code-format = true
-docstring-code-line-length = 20
-
-[tool.ruff.lint]
-preview = true
-select = [
-    "E4",
-    "E7",
-    "E9",
-    "W291",
-    "YTT",
-    "T10",
-    "ICN",
-    "INP",
-    "Q",
-    "RSE",
-    "SLOT",
-    "INT",
-    "FLY",
-    "LOG",
-    "C90",
-    "T20",
-    "D",
-    "RET",
-    "PD",
-    "N",
-    "PIE",
-    "SIM",
-    "S",
-    "G",
-    "ERA",
-    "ASYNC",
-    "TID",
-    "UP",
-    "SLF",
-    "BLE",
-    "C4",
-    "I",
-    "F",
-    "A",
-    "ARG",
-    "PTH",
-    "RUF",
-    "B",
-    "TCH",
-    "DTZ",
-    "PYI",
-    "PT",
-    "EM",
-    "TRY",
-    "PERF",
-    "CPY",
-    # "FBT", # use named arguments for boolean flags
-    # "TD", # todos
-    # "FIX", # fixme
-    # "FURB" # preview rules
-    # ANN # Type annotations, re-enable when we get bandwidth
-]
-ignore = [
-    # Ignore module names shadowing Python builtins
-    "A005",
-    # Deprecated Rules
-    "ANN101",
-    "ANN102",
-    # Conflicts with interface argument checking
-    "ARG002",
-    "ANN204",
-    # TODO: Inspect these pandas rules for validity
-    "PD002", # prevents inplace=True
-    # TODO RE-Enable when we get bandwidth
-    "PERF203", # Needs restructuring of errors, we should bail-out on first error
-    "C901", # needs refactoring to remove cyclomatic complexity
-]
-
-[tool.ruff.lint.per-file-ignores]
-"tests/*" = ["S", "D", "ANN", "T201", "ASYNC", "ARG", "PTH", "TRY"]
-"examples/*" = ["S", "D", "ANN", "T201", "PTH", "TRY", "PERF"]
-"graphrag/index/config/*" = ["TCH"]
-"*.ipynb" = ["T201"]
-
-[tool.ruff.lint.flake8-builtins]
-builtins-ignorelist = ["input", "id", "bytes"]
-
-[tool.ruff.lint.pydocstyle]
-convention = "numpy"
-
 # https://github.com/microsoft/pyright/blob/9f81564a4685ff5c55edd3959f9b39030f590b2f/docs/configuration.md#sample-pyprojecttoml-file
 [tool.pyright]
 include = ["graphrag", "tests", "examples", "examples_notebooks"]
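
The select list is carried over to ruff.toml unchanged. As one concrete example of what it enforces, the EM family (flake8-errmsg) wants exception messages bound to a name so a traceback does not repeat a long literal; the function and message below are made up for illustration:

    def load_config(path: str) -> dict:
        if not path.endswith(".toml"):
            # EM102 would flag: raise ValueError(f"not a TOML file: {path!r}")
            msg = f"not a TOML file: {path!r}"
            raise ValueError(msg)
        return {}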
ruff.toml (new file, 91 lines)
@@ -0,0 +1,91 @@
+target-version = "py310"
+extend-include = ["*.ipynb"]
+
+[format]
+preview = true
+docstring-code-format = true
+docstring-code-line-length = 20
+
+[lint]
+preview = true
+select = [
+    "E4",
+    "E7",
+    "E9",
+    "W291",
+    "YTT",
+    "T10",
+    "ICN",
+    "INP",
+    "Q",
+    "RSE",
+    "SLOT",
+    "INT",
+    "FLY",
+    "LOG",
+    "C90",
+    "T20",
+    "D",
+    "RET",
+    "PD",
+    "N",
+    "PIE",
+    "SIM",
+    "S",
+    "G",
+    "ERA",
+    "ASYNC",
+    "TID",
+    "UP",
+    "SLF",
+    "BLE",
+    "C4",
+    "I",
+    "F",
+    "A",
+    "ARG",
+    "PTH",
+    "RUF",
+    "B",
+    "TCH",
+    "DTZ",
+    "PYI",
+    "PT",
+    "EM",
+    "TRY",
+    "PERF",
+    "CPY",
+    # "FBT", # use named arguments for boolean flags
+    # "TD", # todos
+    # "FIX", # fixme
+    # "FURB" # preview rules
+    # ANN # Type annotations, re-enable when we get bandwidth
+]
+ignore = [
+    # Ignore module names shadowing Python builtins
+    "A005",
+    # Deprecated Rules
+    "ANN101",
+    "ANN102",
+    # Conflicts with interface argument checking
+    "ARG002",
+    "ANN204",
+    # TODO: Inspect these pandas rules for validity
+    "PD002", # prevents inplace=True
+    # TODO RE-Enable when we get bandwidth
+    "PERF203", # Needs restructuring of errors, we should bail-out on first error
+    "C901", # needs refactoring to remove cyclomatic complexity
+]
+
+[lint.per-file-ignores]
+"tests/*" = ["S", "D", "ANN", "T201", "ASYNC", "ARG", "PTH", "TRY"]
+"examples/*" = ["S", "D", "ANN", "T201", "PTH", "TRY", "PERF"]
+"graphrag/index/config/*" = ["TCH"]
+"*.ipynb" = ["T201"]
+
+[lint.flake8-builtins]
+builtins-ignorelist = ["input", "id", "bytes"]
+
+[lint.pydocstyle]
+convention = "numpy"
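
The new file is a near-verbatim move of the deleted [tool.ruff] tables; the only structural change is that a dedicated ruff.toml drops the tool.ruff prefix, and ruff prefers it over a [tool.ruff] table in pyproject.toml when both exist in the same directory. A quick sketch of the equivalence (tomllib needs Python 3.11+):

    import tomllib

    pyproject_style = tomllib.loads("""
    [tool.ruff.lint]
    preview = true
    """)
    ruff_toml_style = tomllib.loads("""
    [lint]
    preview = true
    """)
    assert pyproject_style["tool"]["ruff"]["lint"] == ruff_toml_style["lint"]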
@@ -66,10 +66,10 @@ class TestRun(unittest.IsolatedAsyncioTestCase):
         entity_ids = set(entity_text_unit_map.keys())
 
         for text_unit_id, text_unit_entities in text_unit_entity_map.items():
-            assert text_unit_entities.issubset(
-                entity_ids
-            ), f"Text unit {text_unit_id} has entities {text_unit_entities} that are not in the entity set"
+            assert text_unit_entities.issubset(entity_ids), (
+                f"Text unit {text_unit_id} has entities {text_unit_entities} that are not in the entity set"
+            )
         for entity_id, entity_text_units in entity_text_unit_map.items():
-            assert entity_text_units.issubset(
-                text_unit_ids
-            ), f"Entity {entity_id} has text units {entity_text_units} that are not in the text unit set"
+            assert entity_text_units.issubset(text_unit_ids), (
+                f"Entity {entity_id} has text units {entity_text_units} that are not in the text unit set"
+            )
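
Every remaining hunk in the commit is the same mechanical rewrite: with the updated formatter (preview = true in the config above), a long assert wraps by parenthesizing the message on its own lines instead of splitting the condition across parentheses, which keeps the asserted expression readable on one line. Distilled to a minimal before/after with invented names:

    values, allowed = {1, 2}, {1, 2, 3}

    # old wrapping: the condition is split across the parentheses
    assert values.issubset(
        allowed
    ), f"unexpected values: {values - allowed}"

    # new wrapping: the condition stays on one line, the message is parenthesized
    assert values.issubset(allowed), (
        f"unexpected values: {values - allowed}"
    )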
@@ -145,9 +145,9 @@ class TestIndexer:
         completion = subprocess.run(
             command, env={**os.environ, "GRAPHRAG_INPUT_FILE_TYPE": input_file_type}
         )
-        assert (
-            completion.returncode == 0
-        ), f"Indexer failed with return code: {completion.returncode}"
+        assert completion.returncode == 0, (
+            f"Indexer failed with return code: {completion.returncode}"
+        )
 
     def __assert_indexer_outputs(
         self, root: Path, workflow_config: dict[str, dict[str, Any]]
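
A side note on the context line above: passing env= to subprocess.run replaces the child's environment wholesale, so the test merges its override into a copy of os.environ instead of passing the override alone. A standalone illustration ("text" is just an assumed value for the variable):

    import os
    import subprocess

    env = {**os.environ, "GRAPHRAG_INPUT_FILE_TYPE": "text"}
    completion = subprocess.run(
        ["python", "-c", "import os; print(os.environ['GRAPHRAG_INPUT_FILE_TYPE'])"],
        env=env,
    )
    assert completion.returncode == 0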
@@ -158,9 +158,9 @@ class TestIndexer:
         output_entries.sort(key=lambda entry: entry.stat().st_ctime, reverse=True)
 
         if not debug:
-            assert (
-                len(output_entries) == 1
-            ), f"Expected one output folder, found {len(output_entries)}"
+            assert len(output_entries) == 1, (
+                f"Expected one output folder, found {len(output_entries)}"
+            )
 
         output_path = output_entries[0]
         assert output_path.exists(), "output folder does not exist"
@@ -174,18 +174,18 @@
         # Check all workflows run
         expected_workflows = set(workflow_config.keys())
         workflows = set(stats["workflows"].keys())
-        assert (
-            workflows == expected_workflows
-        ), f"Workflows missing from stats.json: {expected_workflows - workflows}. Unexpected workflows in stats.json: {workflows - expected_workflows}"
+        assert workflows == expected_workflows, (
+            f"Workflows missing from stats.json: {expected_workflows - workflows}. Unexpected workflows in stats.json: {workflows - expected_workflows}"
+        )
 
         # [OPTIONAL] Check runtime
         for workflow in expected_workflows:
             # Check max runtime
             max_runtime = workflow_config[workflow].get("max_runtime", None)
             if max_runtime:
-                assert (
-                    stats["workflows"][workflow]["overall"] <= max_runtime
-                ), f"Expected max runtime of {max_runtime}, found: {stats['workflows'][workflow]['overall']} for workflow: {workflow}"
+                assert stats["workflows"][workflow]["overall"] <= max_runtime, (
+                    f"Expected max runtime of {max_runtime}, found: {stats['workflows'][workflow]['overall']} for workflow: {workflow}"
+                )
 
         # Check artifacts
         artifact_files = os.listdir(artifacts)
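
The reworded assertion keeps the useful property of the original message: it reports drift in both directions via two set differences. A standalone illustration with invented workflow names:

    expected_workflows = {"create_base_text_units", "create_final_entities"}
    workflows = {"create_final_entities", "create_final_nodes"}

    missing = expected_workflows - workflows      # in config, absent from stats.json
    unexpected = workflows - expected_workflows   # in stats.json, absent from config
    assert missing == {"create_base_text_units"}
    assert unexpected == {"create_final_nodes"}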
@@ -195,10 +195,11 @@ class TestIndexer:
         transient_workflows = [
             "workflow:create_base_text_units",
         ]
-        assert (
-            len(artifact_files)
-            == (len(expected_workflows) - len(transient_workflows) + 1)
-        ), f"Expected {len(expected_workflows) + 1} artifacts, found: {len(artifact_files)}"
+        assert len(artifact_files) == (
+            len(expected_workflows) - len(transient_workflows) + 1
+        ), (
+            f"Expected {len(expected_workflows) + 1} artifacts, found: {len(artifact_files)}"
+        )
 
         for artifact in artifact_files:
             if artifact.endswith(".parquet"):
@@ -211,16 +212,18 @@
                     workflow["row_range"][0]
                     <= len(output_df)
                     <= workflow["row_range"][1]
-                ), f"Expected between {workflow['row_range'][0]} and {workflow['row_range'][1]}, found: {len(output_df)} for file: {artifact}"
+                ), (
+                    f"Expected between {workflow['row_range'][0]} and {workflow['row_range'][1]}, found: {len(output_df)} for file: {artifact}"
+                )
 
                 # Get non-nan rows
                 nan_df = output_df.loc[
                     :, ~output_df.columns.isin(workflow.get("nan_allowed_columns", []))
                 ]
                 nan_df = nan_df[nan_df.isna().any(axis=1)]
-                assert (
-                    len(nan_df) == 0
-                ), f"Found {len(nan_df)} rows with NaN values for file: {artifact} on columns: {nan_df.columns[nan_df.isna().any()].tolist()}"
+                assert len(nan_df) == 0, (
+                    f"Found {len(nan_df)} rows with NaN values for file: {artifact} on columns: {nan_df.columns[nan_df.isna().any()].tolist()}"
+                )
 
     def __run_query(self, root: Path, query_config: dict[str, str]):
         command = [
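
The NaN check that follows the reformatted assert is worth reading on its own: it drops the columns where NaN is permitted, then keeps only rows with a NaN in what remains. Reduced to a standalone sketch with illustrative column names:

    import pandas as pd

    df = pd.DataFrame({
        "id": [1, 2, 3],
        "description": ["a", None, "c"],  # NaN allowed in this column
        "title": ["x", "y", None],        # NaN not allowed here
    })
    nan_allowed_columns = ["description"]

    nan_df = df.loc[:, ~df.columns.isin(nan_allowed_columns)]
    nan_df = nan_df[nan_df.isna().any(axis=1)]
    assert len(nan_df) == 1  # only the row with the missing title remains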
@@ -296,8 +299,8 @@ class TestIndexer:
             result.stderr if "No existing dataset at" not in result.stderr else ""
         )
 
-        assert (
-            stderror == "" or stderror.replace("\n", "") in KNOWN_WARNINGS
-        ), f"Query failed with error: {stderror}"
+        assert stderror == "" or stderror.replace("\n", "") in KNOWN_WARNINGS, (
+            f"Query failed with error: {stderror}"
+        )
         assert result.stdout is not None, "Query returned no output"
         assert len(result.stdout) > 0, "Query returned empty output"
@@ -81,9 +81,9 @@ async def test_normal_result_emits_parquet():
     ]
 
     assert len(pipeline_result) == 1
-    assert (
-        storage.keys() == ["stats.json", "mock_write", "mock_workflow.parquet"]
-    ), "Mock workflow output should be written to storage by the emitter when there is a non-empty data frame"
+    assert storage.keys() == ["stats.json", "mock_write", "mock_workflow.parquet"], (
+        "Mock workflow output should be written to storage by the emitter when there is a non-empty data frame"
+    )
 
 
 async def test_empty_result_does_not_emit_parquet():
@@ -74,9 +74,9 @@ async def test_create_base_entity_graph():
         context=context,
     )
 
-    assert len(actual.columns) == len(
-        expected.columns
-    ), "Graph dataframe columns differ"
+    assert len(actual.columns) == len(expected.columns), (
+        "Graph dataframe columns differ"
+    )
     # let's parse a sample of the raw graphml
     actual_graphml_0 = actual["clustered_graph"][:1][0]
     actual_graph_0 = nx.parse_graphml(actual_graphml_0)
@@ -121,9 +121,9 @@ async def test_create_base_entity_graph_with_embeddings():
         context=context,
     )
 
-    assert (
-        len(actual.columns) == len(expected.columns) + 1
-    ), "Graph dataframe missing embedding column"
+    assert len(actual.columns) == len(expected.columns) + 1, (
+        "Graph dataframe missing embedding column"
+    )
     assert "embeddings" in actual.columns, "Graph dataframe missing embedding column"
@@ -86,9 +86,9 @@ def compare_outputs(
     """
     cols = expected.columns if columns is None else columns
 
-    assert len(actual) == len(
-        expected
-    ), f"Expected: {len(expected)} rows, Actual: {len(actual)} rows"
+    assert len(actual) == len(expected), (
+        f"Expected: {len(expected)} rows, Actual: {len(actual)} rows"
+    )
 
     for column in cols:
         assert column in actual.columns