update ruff, move ruff settings to ruff.toml

Chris Trevino 2024-10-30 12:55:37 -07:00
parent 7235c6faf5
commit 2f763f51c0
11 changed files with 543 additions and 523 deletions

.gitattributes (vendored, new file, 11 lines added)

@@ -0,0 +1,11 @@
*.txt text eol=lf
*.md text eol=lf
*.yml text eol=lf
*.html text eol=lf
*.py text eol=lf
*.toml text eol=lf
.gitattributes text eol=lf
.gitignore text eol=lf
*.lock text eol=lf
CODEOWNERS text eol=lf
LICENSE text eol=lf
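The new .gitattributes pins LF endings for every text-like file in the repo, so checkouts on Windows and Unix produce byte-identical files and cross-platform diffs stay clean. Below is a minimal sketch of how a working tree could be audited for stray CRLF bytes in the normalized types; the script and its names are hypothetical, not part of this commit.

```python
# audit_line_endings.py - hypothetical helper, not part of this commit.
from pathlib import Path

# File types that .gitattributes above normalizes to LF.
NORMALIZED_SUFFIXES = {".txt", ".md", ".yml", ".html", ".py", ".toml", ".lock"}


def find_crlf_files(root: str = ".") -> list[Path]:
    """Return files of an LF-normalized type that still contain CRLF bytes."""
    return [
        path
        for path in Path(root).rglob("*")
        if ".git" not in path.parts
        and path.is_file()
        and path.suffix in NORMALIZED_SUFFIXES
        and b"\r\n" in path.read_bytes()
    ]


if __name__ == "__main__":
    for offender in find_crlf_files():
        # These are candidates for `git add --renormalize .` once the
        # attributes file lands.
        print(f"CRLF endings: {offender}")
```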

.vscode/settings.json (vendored, 27 lines changed)

@@ -12,19 +12,11 @@
"typescript.preferences.importModuleSpecifierEnding": "js",
"explorer.fileNesting.enabled": true,
"explorer.fileNesting.patterns": {
"*.ts": "${capture}.ts, ${capture}.hooks.ts, ${capture}.hooks.tsx, ${capture}.contexts.ts, ${capture}.stories.tsx, ${capture}.story.tsx, ${capture}.spec.tsx, ${capture}.base.ts, ${capture}.base.tsx, ${capture}.types.ts, ${capture}.styles.ts, ${capture}.styles.tsx, ${capture}.utils.ts, ${capture}.utils.tsx, ${capture}.constants.ts, ${capture}.module.scss, ${capture}.module.css, ${capture}.md",
"*.js": "${capture}.js.map, ${capture}.min.js, ${capture}.d.ts",
"*.jsx": "${capture}.js",
"*.tsx": "${capture}.ts, ${capture}.hooks.ts, ${capture}.hooks.tsx, ${capture}.contexts.ts, ${capture}.stories.tsx, ${capture}.story.tsx, ${capture}.spec.tsx, ${capture}.base.ts, ${capture}.base.tsx, ${capture}.types.ts, ${capture}.styles.ts, ${capture}.styles.tsx, ${capture}.utils.ts, ${capture}.utils.tsx, ${capture}.constants.ts, ${capture}.module.scss, ${capture}.module.css, ${capture}.md, ${capture}.css",
"tsconfig.json": "tsconfig.*.json",
"package.json": "package-lock.json, turbo.json, tsconfig.json, rome.json, biome.json, .npmignore, dictionary.txt, cspell.config.yaml",
"README.md": "*.md, LICENSE, CODEOWNERS",
".eslintrc": ".eslintignore",
".prettierrc": ".prettierignore",
".gitattributes": ".gitignore",
".yarnrc.yml": "yarn.lock, .pnp.*",
"jest.config.js": "jest.setup.mjs",
"pyproject.toml": "poetry.lock, poetry.toml, mkdocs.yaml",
"pyproject.toml": "*.lock, *.toml, mkdocs.yaml",
"cspell.config.yaml": "dictionary.txt"
},
"azureFunctions.postDeployTask": "npm install (functions)",
@@ -36,6 +28,19 @@
"node_modules{,/**}",
".vscode{,/**}"
],
"[python]": {
"editor.formatOnSave": true,
"editor.codeActionsOnSave": {
"source.organizeImports": "explicit",
"source.fixAll": "explicit"
},
"editor.defaultFormatter": "charliermarsh.ruff"
},
"notebook.formatOnSave.enabled": true,
"notebook.codeActionsOnSave": {
"notebook.source.fixAll": "explicit",
"notebook.source.organizeImports": "explicit"
},
"python.defaultInterpreterPath": "python/services/.venv/bin/python",
"python.languageServer": "Pylance",
"cSpell.customDictionaries": {
@@ -47,5 +52,7 @@
},
"custom": true, // Enable the `custom` dictionary
"internal-terms": true // Disable the `internal-terms` dictionary
}
},
"ruff.configuration": "./ruff.toml",
"ruff.nativeServer": "on"
}
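With these settings, saving a Python file or notebook applies ruff's fixes and import sorting and formats through ruff's native language server, all driven by the repo-root ruff.toml. The same behavior can be reproduced from the command line; the wrapper below is a hypothetical convenience script (ruff's `check --fix`, `format`, and `--config` options are real, the script itself is not in this commit).

```python
# lint.py - hypothetical wrapper mirroring the editor's on-save behavior.
import subprocess
import sys


def ruff(*args: str) -> int:
    """Run ruff against the repo using the checked-in ruff.toml."""
    return subprocess.call(["ruff", *args, "--config", "ruff.toml", "."])


if __name__ == "__main__":
    # "source.fixAll" + "source.organizeImports" on save roughly equals
    # `ruff check --fix` (import sorting is the `I` rule family) ...
    if (code := ruff("check", "--fix")) != 0:
        sys.exit(code)
    # ... and ruff as the default formatter roughly equals `ruff format`.
    sys.exit(ruff("format"))
```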


@@ -332,10 +332,10 @@ class GlobalSearch(BaseSearch[GlobalContextBuilder]):
for point in filtered_key_points:
formatted_response_data = []
formatted_response_data.append(
f'----Analyst {point["analyst"] + 1}----'
f"----Analyst {point['analyst'] + 1}----"
)
formatted_response_data.append(
f'Importance Score: {point["score"]}' # type: ignore
f"Importance Score: {point['score']}" # type: ignore
)
formatted_response_data.append(point["answer"]) # type: ignore
formatted_response_text = "\n".join(formatted_response_data)
@@ -431,8 +431,8 @@ class GlobalSearch(BaseSearch[GlobalContextBuilder]):
total_tokens = 0
for point in filtered_key_points:
formatted_response_data = [
f'----Analyst {point["analyst"] + 1}----',
f'Importance Score: {point["score"]}',
f"----Analyst {point['analyst'] + 1}----",
f"Importance Score: {point['score']}",
point["answer"],
]
formatted_response_text = "\n".join(formatted_response_data)
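These hunks are the updated ruff formatter at work: because the project targets Python 3.10, an f-string cannot reuse its own quote character inside a replacement field, so the formatter standardizes on double quotes for the outer string and flips the dictionary keys inside to single quotes. The two spellings are equivalent, as this small demonstration (hypothetical data, not from the codebase) shows:

```python
# Both spellings produce identical output; only the quote placement changes.
point = {"analyst": 0, "score": 92}

old_style = f'----Analyst {point["analyst"] + 1}----'  # single quotes outside
new_style = f"----Analyst {point['analyst'] + 1}----"  # formatter-preferred form

assert old_style == new_style == "----Analyst 1----"
```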

poetry.lock (generated, 748 lines changed)

File diff suppressed because it is too large.

pyproject.toml

@@ -99,11 +99,11 @@ ipykernel = "^6.29.4"
jupyter = "^1.1.1"
nbconvert = "^7.16.3"
poethepoet = "^0.28.0"
pyright = "^1.1.384"
pyright = "^1.1.387"
pytest = "^8.3.2"
pytest-asyncio = "^0.24.0"
pytest-timeout = "^2.3.1"
ruff = "^0.6.9"
ruff = "^0.7.1"
semversioner = "^2.0.3"
update-toml = "^0.2.1"
deptry = "^0.20.0"
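The bumps to pyright ^1.1.387 and ruff ^0.7.1 are caret constraints, so Poetry resolves the newest compatible releases when the lock file is regenerated (hence the large poetry.lock diff above). A quick, hypothetical check of what actually resolved in the active environment:

```python
# verify_dev_tools.py - hypothetical check, not part of this commit.
from importlib.metadata import PackageNotFoundError, version

for package, floor in [("ruff", "0.7.1"), ("pyright", "1.1.387")]:
    try:
        # Reports the installed version next to the pyproject lower bound.
        print(f"{package}: installed {version(package)} (pyproject floor ^{floor})")
    except PackageNotFoundError:
        print(f"{package}: not installed in this environment")
```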
@@ -170,98 +170,6 @@ ignore_fail = 'return_non_zero'
sequence = ['_test_all', 'coverage_report']
ignore_fail = 'return_non_zero'
[tool.ruff]
target-version = "py310"
extend-include = ["*.ipynb"]
[tool.ruff.format]
preview = true
docstring-code-format = true
docstring-code-line-length = 20
[tool.ruff.lint]
preview = true
select = [
"E4",
"E7",
"E9",
"W291",
"YTT",
"T10",
"ICN",
"INP",
"Q",
"RSE",
"SLOT",
"INT",
"FLY",
"LOG",
"C90",
"T20",
"D",
"RET",
"PD",
"N",
"PIE",
"SIM",
"S",
"G",
"ERA",
"ASYNC",
"TID",
"UP",
"SLF",
"BLE",
"C4",
"I",
"F",
"A",
"ARG",
"PTH",
"RUF",
"B",
"TCH",
"DTZ",
"PYI",
"PT",
"EM",
"TRY",
"PERF",
"CPY",
# "FBT", # use named arguments for boolean flags
# "TD", # todos
# "FIX", # fixme
# "FURB" # preview rules
# ANN # Type annotations, re-enable when we get bandwidth
]
ignore = [
# Ignore module names shadowing Python builtins
"A005",
# Deprecated Rules
"ANN101",
"ANN102",
# Conflicts with interface argument checking
"ARG002",
"ANN204",
# TODO: Inspect these pandas rules for validity
"PD002", # prevents inplace=True
# TODO RE-Enable when we get bandwidth
"PERF203", # Needs restructuring of errors, we should bail-out on first error
"C901", # needs refactoring to remove cyclomatic complexity
]
[tool.ruff.lint.per-file-ignores]
"tests/*" = ["S", "D", "ANN", "T201", "ASYNC", "ARG", "PTH", "TRY"]
"examples/*" = ["S", "D", "ANN", "T201", "PTH", "TRY", "PERF"]
"graphrag/index/config/*" = ["TCH"]
"*.ipynb" = ["T201"]
[tool.ruff.lint.flake8-builtins]
builtins-ignorelist = ["input", "id", "bytes"]
[tool.ruff.lint.pydocstyle]
convention = "numpy"
# https://github.com/microsoft/pyright/blob/9f81564a4685ff5c55edd3959f9b39030f590b2f/docs/configuration.md#sample-pyprojecttoml-file
[tool.pyright]
include = ["graphrag", "tests", "examples", "examples_notebooks"]

ruff.toml (new file, 91 lines added)

@@ -0,0 +1,91 @@
target-version = "py310"
extend-include = ["*.ipynb"]
[format]
preview = true
docstring-code-format = true
docstring-code-line-length = 20
[lint]
preview = true
select = [
"E4",
"E7",
"E9",
"W291",
"YTT",
"T10",
"ICN",
"INP",
"Q",
"RSE",
"SLOT",
"INT",
"FLY",
"LOG",
"C90",
"T20",
"D",
"RET",
"PD",
"N",
"PIE",
"SIM",
"S",
"G",
"ERA",
"ASYNC",
"TID",
"UP",
"SLF",
"BLE",
"C4",
"I",
"F",
"A",
"ARG",
"PTH",
"RUF",
"B",
"TCH",
"DTZ",
"PYI",
"PT",
"EM",
"TRY",
"PERF",
"CPY",
# "FBT", # use named arguments for boolean flags
# "TD", # todos
# "FIX", # fixme
# "FURB" # preview rules
# ANN # Type annotations, re-enable when we get bandwidth
]
ignore = [
# Ignore module names shadowing Python builtins
"A005",
# Deprecated Rules
"ANN101",
"ANN102",
# Conflicts with interface argument checking
"ARG002",
"ANN204",
# TODO: Inspect these pandas rules for validity
"PD002", # prevents inplace=True
# TODO RE-Enable when we get bandwidth
"PERF203", # Needs restructuring of errors, we should bail-out on first error
"C901", # needs refactoring to remove cyclomatic complexity
]
[lint.per-file-ignores]
"tests/*" = ["S", "D", "ANN", "T201", "ASYNC", "ARG", "PTH", "TRY"]
"examples/*" = ["S", "D", "ANN", "T201", "PTH", "TRY", "PERF"]
"graphrag/index/config/*" = ["TCH"]
"*.ipynb" = ["T201"]
[lint.flake8-builtins]
builtins-ignorelist = ["input", "id", "bytes"]
[lint.pydocstyle]
convention = "numpy"
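The configuration itself is unchanged; only its home moves. In a standalone ruff.toml the `tool.ruff` prefix is dropped, so `[tool.ruff.lint]` becomes `[lint]`, and ruff gives a ruff.toml precedence over `[tool.ruff]` in pyproject.toml, which is why the pyproject block is deleted rather than kept in parallel. As a sanity check on the selected rule families, here is a small hypothetical snippet (not from the repo) that this config would flag:

```python
# example.py - hypothetical; ruff flags these under the config above.
import os


def read_config(path):  # D103: missing docstring (the `D` family is selected)
    print("loading", path)  # T201: `print` found (`T20` is selected)
    if not os.path.exists(path):  # PTH110: prefer `Path(path).exists()`
        msg = f"missing config: {path}"
        raise FileNotFoundError(msg)  # assigning `msg` first satisfies the `EM` rules
    return open(path).read()  # PTH123: prefer `Path.open()`
```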


@@ -66,10 +66,10 @@ class TestRun(unittest.IsolatedAsyncioTestCase):
entity_ids = set(entity_text_unit_map.keys())
for text_unit_id, text_unit_entities in text_unit_entity_map.items():
assert text_unit_entities.issubset(
entity_ids
), f"Text unit {text_unit_id} has entities {text_unit_entities} that are not in the entity set"
assert text_unit_entities.issubset(entity_ids), (
f"Text unit {text_unit_id} has entities {text_unit_entities} that are not in the entity set"
)
for entity_id, entity_text_units in entity_text_unit_map.items():
assert entity_text_units.issubset(
text_unit_ids
), f"Entity {entity_id} has text units {entity_text_units} that are not in the text unit set"
assert entity_text_units.issubset(text_unit_ids), (
f"Entity {entity_id} has text units {entity_text_units} that are not in the text unit set"
)
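This and the remaining test hunks are all the same mechanical rewrite from the updated formatter (format.preview is enabled): when an `assert` with a long message must wrap, the old style broke the condition across lines and left the message dangling after the comma, while the new style keeps the condition on one line and parenthesizes the message. Schematically, with hypothetical names:

```python
# Hypothetical illustration of the formatter change applied in these tests.
expected, actual = {"a", "b"}, {"a"}

# Old wrapping: condition split across lines, message trailing the comma.
assert actual.issubset(
    expected
), f"unexpected members: {actual - expected}"

# New wrapping: condition on one line, message parenthesized on its own lines.
assert actual.issubset(expected), (
    f"unexpected members: {actual - expected}"
)
```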


@@ -145,9 +145,9 @@ class TestIndexer:
completion = subprocess.run(
command, env={**os.environ, "GRAPHRAG_INPUT_FILE_TYPE": input_file_type}
)
assert (
completion.returncode == 0
), f"Indexer failed with return code: {completion.returncode}"
assert completion.returncode == 0, (
f"Indexer failed with return code: {completion.returncode}"
)
def __assert_indexer_outputs(
self, root: Path, workflow_config: dict[str, dict[str, Any]]
@@ -158,9 +158,9 @@ class TestIndexer:
output_entries.sort(key=lambda entry: entry.stat().st_ctime, reverse=True)
if not debug:
assert (
len(output_entries) == 1
), f"Expected one output folder, found {len(output_entries)}"
assert len(output_entries) == 1, (
f"Expected one output folder, found {len(output_entries)}"
)
output_path = output_entries[0]
assert output_path.exists(), "output folder does not exist"
@@ -174,18 +174,18 @@ class TestIndexer:
# Check all workflows run
expected_workflows = set(workflow_config.keys())
workflows = set(stats["workflows"].keys())
assert (
workflows == expected_workflows
), f"Workflows missing from stats.json: {expected_workflows - workflows}. Unexpected workflows in stats.json: {workflows - expected_workflows}"
assert workflows == expected_workflows, (
f"Workflows missing from stats.json: {expected_workflows - workflows}. Unexpected workflows in stats.json: {workflows - expected_workflows}"
)
# [OPTIONAL] Check runtime
for workflow in expected_workflows:
# Check max runtime
max_runtime = workflow_config[workflow].get("max_runtime", None)
if max_runtime:
assert (
stats["workflows"][workflow]["overall"] <= max_runtime
), f"Expected max runtime of {max_runtime}, found: {stats['workflows'][workflow]['overall']} for workflow: {workflow}"
assert stats["workflows"][workflow]["overall"] <= max_runtime, (
f"Expected max runtime of {max_runtime}, found: {stats['workflows'][workflow]['overall']} for workflow: {workflow}"
)
# Check artifacts
artifact_files = os.listdir(artifacts)
@@ -195,10 +195,11 @@ class TestIndexer:
transient_workflows = [
"workflow:create_base_text_units",
]
assert (
len(artifact_files)
== (len(expected_workflows) - len(transient_workflows) + 1)
), f"Expected {len(expected_workflows) + 1} artifacts, found: {len(artifact_files)}"
assert len(artifact_files) == (
len(expected_workflows) - len(transient_workflows) + 1
), (
f"Expected {len(expected_workflows) + 1} artifacts, found: {len(artifact_files)}"
)
for artifact in artifact_files:
if artifact.endswith(".parquet"):
@@ -211,16 +212,18 @@ class TestIndexer:
workflow["row_range"][0]
<= len(output_df)
<= workflow["row_range"][1]
), f"Expected between {workflow['row_range'][0]} and {workflow['row_range'][1]}, found: {len(output_df)} for file: {artifact}"
), (
f"Expected between {workflow['row_range'][0]} and {workflow['row_range'][1]}, found: {len(output_df)} for file: {artifact}"
)
# Get non-nan rows
nan_df = output_df.loc[
:, ~output_df.columns.isin(workflow.get("nan_allowed_columns", []))
]
nan_df = nan_df[nan_df.isna().any(axis=1)]
assert (
len(nan_df) == 0
), f"Found {len(nan_df)} rows with NaN values for file: {artifact} on columns: {nan_df.columns[nan_df.isna().any()].tolist()}"
assert len(nan_df) == 0, (
f"Found {len(nan_df)} rows with NaN values for file: {artifact} on columns: {nan_df.columns[nan_df.isna().any()].tolist()}"
)
def __run_query(self, root: Path, query_config: dict[str, str]):
command = [
@@ -296,8 +299,8 @@ class TestIndexer:
result.stderr if "No existing dataset at" not in result.stderr else ""
)
assert (
stderror == "" or stderror.replace("\n", "") in KNOWN_WARNINGS
), f"Query failed with error: {stderror}"
assert stderror == "" or stderror.replace("\n", "") in KNOWN_WARNINGS, (
f"Query failed with error: {stderror}"
)
assert result.stdout is not None, "Query returned no output"
assert len(result.stdout) > 0, "Query returned empty output"


@@ -81,9 +81,9 @@ async def test_normal_result_emits_parquet():
]
assert len(pipeline_result) == 1
assert (
storage.keys() == ["stats.json", "mock_write", "mock_workflow.parquet"]
), "Mock workflow output should be written to storage by the emitter when there is a non-empty data frame"
assert storage.keys() == ["stats.json", "mock_write", "mock_workflow.parquet"], (
"Mock workflow output should be written to storage by the emitter when there is a non-empty data frame"
)
async def test_empty_result_does_not_emit_parquet():


@@ -74,9 +74,9 @@ async def test_create_base_entity_graph():
context=context,
)
assert len(actual.columns) == len(
expected.columns
), "Graph dataframe columns differ"
assert len(actual.columns) == len(expected.columns), (
"Graph dataframe columns differ"
)
# let's parse a sample of the raw graphml
actual_graphml_0 = actual["clustered_graph"][:1][0]
actual_graph_0 = nx.parse_graphml(actual_graphml_0)
@@ -121,9 +121,9 @@ async def test_create_base_entity_graph_with_embeddings():
context=context,
)
assert (
len(actual.columns) == len(expected.columns) + 1
), "Graph dataframe missing embedding column"
assert len(actual.columns) == len(expected.columns) + 1, (
"Graph dataframe missing embedding column"
)
assert "embeddings" in actual.columns, "Graph dataframe missing embedding column"


@@ -86,9 +86,9 @@ def compare_outputs(
"""
cols = expected.columns if columns is None else columns
assert len(actual) == len(
expected
), f"Expected: {len(expected)} rows, Actual: {len(actual)} rows"
assert len(actual) == len(expected), (
f"Expected: {len(expected)} rows, Actual: {len(actual)} rows"
)
for column in cols:
assert column in actual.columns