update ruff, move ruff settings to ruff.toml

Chris Trevino 2024-10-30 12:55:37 -07:00
parent 7235c6faf5
commit 2f763f51c0
11 changed files with 543 additions and 523 deletions

.gitattributes (vendored, new file, 11 lines added)

@@ -0,0 +1,11 @@
*.txt text eol=lf
*.md text eol=lf
*.yml text eol=lf
*.html text eol=lf
*.py text eol=lf
*.toml text eol=lf
.gitattributes text eol=lf
.gitignore text eol=lf
*.lock text eol=lf
CODEOWNERS text eol=lf
LICENSE text eol=lf
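The new .gitattributes pins LF endings for every text-like file in the repo, so checkouts on Windows and Unix produce byte-identical files and cross-platform diffs stay clean. Below is a minimal sketch of how a working tree could be audited for stray CRLF bytes in the normalized types; the script and its names are hypothetical, not part of this commit.

```python
# audit_line_endings.py - hypothetical helper, not part of this commit.
from pathlib import Path

# File types that .gitattributes above normalizes to LF.
NORMALIZED_SUFFIXES = {".txt", ".md", ".yml", ".html", ".py", ".toml", ".lock"}


def find_crlf_files(root: str = ".") -> list[Path]:
    """Return files of an LF-normalized type that still contain CRLF bytes."""
    return [
        path
        for path in Path(root).rglob("*")
        if ".git" not in path.parts
        and path.is_file()
        and path.suffix in NORMALIZED_SUFFIXES
        and b"\r\n" in path.read_bytes()
    ]


if __name__ == "__main__":
    for offender in find_crlf_files():
        # These are candidates for `git add --renormalize .` once the
        # attributes file lands.
        print(f"CRLF endings: {offender}")
```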

.vscode/settings.json (vendored, 27 lines changed)

@@ -12,19 +12,11 @@
"typescript.preferences.importModuleSpecifierEnding": "js",
"explorer.fileNesting.enabled": true,
"explorer.fileNesting.patterns": {
"*.ts": "${capture}.ts, ${capture}.hooks.ts, ${capture}.hooks.tsx, ${capture}.contexts.ts, ${capture}.stories.tsx, ${capture}.story.tsx, ${capture}.spec.tsx, ${capture}.base.ts, ${capture}.base.tsx, ${capture}.types.ts, ${capture}.styles.ts, ${capture}.styles.tsx, ${capture}.utils.ts, ${capture}.utils.tsx, ${capture}.constants.ts, ${capture}.module.scss, ${capture}.module.css, ${capture}.md",
"*.js": "${capture}.js.map, ${capture}.min.js, ${capture}.d.ts",
"*.jsx": "${capture}.js",
"*.tsx": "${capture}.ts, ${capture}.hooks.ts, ${capture}.hooks.tsx, ${capture}.contexts.ts, ${capture}.stories.tsx, ${capture}.story.tsx, ${capture}.spec.tsx, ${capture}.base.ts, ${capture}.base.tsx, ${capture}.types.ts, ${capture}.styles.ts, ${capture}.styles.tsx, ${capture}.utils.ts, ${capture}.utils.tsx, ${capture}.constants.ts, ${capture}.module.scss, ${capture}.module.css, ${capture}.md, ${capture}.css",
"tsconfig.json": "tsconfig.*.json",
"package.json": "package-lock.json, turbo.json, tsconfig.json, rome.json, biome.json, .npmignore, dictionary.txt, cspell.config.yaml",
"README.md": "*.md, LICENSE, CODEOWNERS",
".eslintrc": ".eslintignore",
".prettierrc": ".prettierignore",
".gitattributes": ".gitignore",
".yarnrc.yml": "yarn.lock, .pnp.*",
"jest.config.js": "jest.setup.mjs",
"pyproject.toml": "poetry.lock, poetry.toml, mkdocs.yaml",
"pyproject.toml": "*.lock, *.toml, mkdocs.yaml",
"cspell.config.yaml": "dictionary.txt"
},
"azureFunctions.postDeployTask": "npm install (functions)",
@@ -36,6 +28,19 @@
"node_modules{,/**}",
".vscode{,/**}"
],
"[python]": {
"editor.formatOnSave": true,
"editor.codeActionsOnSave": {
"source.organizeImports": "explicit",
"source.fixAll": "explicit"
},
"editor.defaultFormatter": "charliermarsh.ruff"
},
"notebook.formatOnSave.enabled": true,
"notebook.codeActionsOnSave": {
"notebook.source.fixAll": "explicit",
"notebook.source.organizeImports": "explicit"
},
"python.defaultInterpreterPath": "python/services/.venv/bin/python",
"python.languageServer": "Pylance",
"cSpell.customDictionaries": {
@@ -47,5 +52,7 @@
},
"custom": true, // Enable the `custom` dictionary
"internal-terms": true // Disable the `internal-terms` dictionary
}
},
"ruff.configuration": "./ruff.toml",
"ruff.nativeServer": "on"
}
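With these settings, saving a Python file or notebook applies ruff's fixes and import sorting and formats through ruff's native language server, all driven by the repo-root ruff.toml. The same behavior can be reproduced from the command line; the wrapper below is a hypothetical convenience script (ruff's `check --fix`, `format`, and `--config` options are real, the script itself is not in this commit).

```python
# lint.py - hypothetical wrapper mirroring the editor's on-save behavior.
import subprocess
import sys


def ruff(*args: str) -> int:
    """Run ruff against the repo using the checked-in ruff.toml."""
    return subprocess.call(["ruff", *args, "--config", "ruff.toml", "."])


if __name__ == "__main__":
    # "source.fixAll" + "source.organizeImports" on save roughly equals
    # `ruff check --fix` (import sorting is the `I` rule family) ...
    if (code := ruff("check", "--fix")) != 0:
        sys.exit(code)
    # ... and ruff as the default formatter roughly equals `ruff format`.
    sys.exit(ruff("format"))
```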


@@ -332,10 +332,10 @@ class GlobalSearch(BaseSearch[GlobalContextBuilder]):
for point in filtered_key_points:
formatted_response_data = []
formatted_response_data.append(
f'----Analyst {point["analyst"] + 1}----'
f"----Analyst {point['analyst'] + 1}----"
)
formatted_response_data.append(
f'Importance Score: {point["score"]}' # type: ignore
f"Importance Score: {point['score']}" # type: ignore
)
formatted_response_data.append(point["answer"]) # type: ignore
formatted_response_text = "\n".join(formatted_response_data)
@@ -431,8 +431,8 @@ class GlobalSearch(BaseSearch[GlobalContextBuilder]):
total_tokens = 0
for point in filtered_key_points:
formatted_response_data = [
f'----Analyst {point["analyst"] + 1}----',
f'Importance Score: {point["score"]}',
f"----Analyst {point['analyst'] + 1}----",
f"Importance Score: {point['score']}",
point["answer"],
]
formatted_response_text = "\n".join(formatted_response_data)
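These hunks are the updated ruff formatter at work: because the project targets Python 3.10, an f-string cannot reuse its own quote character inside a replacement field, so the formatter standardizes on double quotes for the outer string and flips the dictionary keys inside to single quotes. The two spellings are equivalent, as this small demonstration (hypothetical data, not from the codebase) shows:

```python
# Both spellings produce identical output; only the quote placement changes.
point = {"analyst": 0, "score": 92}

old_style = f'----Analyst {point["analyst"] + 1}----'  # single quotes outside
new_style = f"----Analyst {point['analyst'] + 1}----"  # formatter-preferred form

assert old_style == new_style == "----Analyst 1----"
```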

poetry.lock (generated, 748 lines changed)

File diff suppressed because it is too large.

pyproject.toml

@@ -99,11 +99,11 @@ ipykernel = "^6.29.4"
jupyter = "^1.1.1"
nbconvert = "^7.16.3"
poethepoet = "^0.28.0"
pyright = "^1.1.384"
pyright = "^1.1.387"
pytest = "^8.3.2"
pytest-asyncio = "^0.24.0"
pytest-timeout = "^2.3.1"
ruff = "^0.6.9"
ruff = "^0.7.1"
semversioner = "^2.0.3"
update-toml = "^0.2.1"
deptry = "^0.20.0"
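The bumps to pyright ^1.1.387 and ruff ^0.7.1 are caret constraints, so Poetry resolves the newest compatible releases when the lock file is regenerated (hence the large poetry.lock diff above). A quick, hypothetical check of what actually resolved in the active environment:

```python
# verify_dev_tools.py - hypothetical check, not part of this commit.
from importlib.metadata import PackageNotFoundError, version

for package, floor in [("ruff", "0.7.1"), ("pyright", "1.1.387")]:
    try:
        # Reports the installed version next to the pyproject lower bound.
        print(f"{package}: installed {version(package)} (pyproject floor ^{floor})")
    except PackageNotFoundError:
        print(f"{package}: not installed in this environment")
```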
@@ -170,98 +170,6 @@ ignore_fail = 'return_non_zero'
sequence = ['_test_all', 'coverage_report']
ignore_fail = 'return_non_zero'
[tool.ruff]
target-version = "py310"
extend-include = ["*.ipynb"]
[tool.ruff.format]
preview = true
docstring-code-format = true
docstring-code-line-length = 20
[tool.ruff.lint]
preview = true
select = [
"E4",
"E7",
"E9",
"W291",
"YTT",
"T10",
"ICN",
"INP",
"Q",
"RSE",
"SLOT",
"INT",
"FLY",
"LOG",
"C90",
"T20",
"D",
"RET",
"PD",
"N",
"PIE",
"SIM",
"S",
"G",
"ERA",
"ASYNC",
"TID",
"UP",
"SLF",
"BLE",
"C4",
"I",
"F",
"A",
"ARG",
"PTH",
"RUF",
"B",
"TCH",
"DTZ",
"PYI",
"PT",
"EM",
"TRY",
"PERF",
"CPY",
# "FBT", # use named arguments for boolean flags
# "TD", # todos
# "FIX", # fixme
# "FURB" # preview rules
# ANN # Type annotations, re-enable when we get bandwidth
]
ignore = [
# Ignore module names shadowing Python builtins
"A005",
# Deprecated Rules
"ANN101",
"ANN102",
# Conflicts with interface argument checking
"ARG002",
"ANN204",
# TODO: Inspect these pandas rules for validity
"PD002", # prevents inplace=True
# TODO RE-Enable when we get bandwidth
"PERF203", # Needs restructuring of errors, we should bail-out on first error
"C901", # needs refactoring to remove cyclomatic complexity
]
[tool.ruff.lint.per-file-ignores]
"tests/*" = ["S", "D", "ANN", "T201", "ASYNC", "ARG", "PTH", "TRY"]
"examples/*" = ["S", "D", "ANN", "T201", "PTH", "TRY", "PERF"]
"graphrag/index/config/*" = ["TCH"]
"*.ipynb" = ["T201"]
[tool.ruff.lint.flake8-builtins]
builtins-ignorelist = ["input", "id", "bytes"]
[tool.ruff.lint.pydocstyle]
convention = "numpy"
# https://github.com/microsoft/pyright/blob/9f81564a4685ff5c55edd3959f9b39030f590b2f/docs/configuration.md#sample-pyprojecttoml-file
[tool.pyright]
include = ["graphrag", "tests", "examples", "examples_notebooks"]

ruff.toml (new file, 91 lines added)

@@ -0,0 +1,91 @@
target-version = "py310"
extend-include = ["*.ipynb"]
[format]
preview = true
docstring-code-format = true
docstring-code-line-length = 20
[lint]
preview = true
select = [
"E4",
"E7",
"E9",
"W291",
"YTT",
"T10",
"ICN",
"INP",
"Q",
"RSE",
"SLOT",
"INT",
"FLY",
"LOG",
"C90",
"T20",
"D",
"RET",
"PD",
"N",
"PIE",
"SIM",
"S",
"G",
"ERA",
"ASYNC",
"TID",
"UP",
"SLF",
"BLE",
"C4",
"I",
"F",
"A",
"ARG",
"PTH",
"RUF",
"B",
"TCH",
"DTZ",
"PYI",
"PT",
"EM",
"TRY",
"PERF",
"CPY",
# "FBT", # use named arguments for boolean flags
# "TD", # todos
# "FIX", # fixme
# "FURB" # preview rules
# ANN # Type annotations, re-enable when we get bandwidth
]
ignore = [
# Ignore module names shadowing Python builtins
"A005",
# Deprecated Rules
"ANN101",
"ANN102",
# Conflicts with interface argument checking
"ARG002",
"ANN204",
# TODO: Inspect these pandas rules for validity
"PD002", # prevents inplace=True
# TODO RE-Enable when we get bandwidth
"PERF203", # Needs restructuring of errors, we should bail-out on first error
"C901", # needs refactoring to remove cyclomatic complexity
]
[lint.per-file-ignores]
"tests/*" = ["S", "D", "ANN", "T201", "ASYNC", "ARG", "PTH", "TRY"]
"examples/*" = ["S", "D", "ANN", "T201", "PTH", "TRY", "PERF"]
"graphrag/index/config/*" = ["TCH"]
"*.ipynb" = ["T201"]
[lint.flake8-builtins]
builtins-ignorelist = ["input", "id", "bytes"]
[lint.pydocstyle]
convention = "numpy"
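The configuration itself is unchanged; only its home moves. In a standalone ruff.toml the `tool.ruff` prefix is dropped, so `[tool.ruff.lint]` becomes `[lint]`, and ruff gives a ruff.toml precedence over `[tool.ruff]` in pyproject.toml, which is why the pyproject block is deleted rather than kept in parallel. As a sanity check on the selected rule families, here is a small hypothetical snippet (not from the repo) that this config would flag:

```python
# example.py - hypothetical; ruff flags these under the config above.
import os


def read_config(path):  # D103: missing docstring (the `D` family is selected)
    print("loading", path)  # T201: `print` found (`T20` is selected)
    if not os.path.exists(path):  # PTH110: prefer `Path(path).exists()`
        msg = f"missing config: {path}"
        raise FileNotFoundError(msg)  # assigning `msg` first satisfies the `EM` rules
    return open(path).read()  # PTH123: prefer `Path.open()`
```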


@@ -66,10 +66,10 @@ class TestRun(unittest.IsolatedAsyncioTestCase):
entity_ids = set(entity_text_unit_map.keys())
for text_unit_id, text_unit_entities in text_unit_entity_map.items():
assert text_unit_entities.issubset(
entity_ids
), f"Text unit {text_unit_id} has entities {text_unit_entities} that are not in the entity set"
assert text_unit_entities.issubset(entity_ids), (
f"Text unit {text_unit_id} has entities {text_unit_entities} that are not in the entity set"
)
for entity_id, entity_text_units in entity_text_unit_map.items():
assert entity_text_units.issubset(
text_unit_ids
), f"Entity {entity_id} has text units {entity_text_units} that are not in the text unit set"
assert entity_text_units.issubset(text_unit_ids), (
f"Entity {entity_id} has text units {entity_text_units} that are not in the text unit set"
)
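This and the remaining test hunks are all the same mechanical rewrite from the updated formatter (format.preview is enabled): when an `assert` with a long message must wrap, the old style broke the condition across lines and left the message dangling after the comma, while the new style keeps the condition on one line and parenthesizes the message. Schematically, with hypothetical names:

```python
# Hypothetical illustration of the formatter change applied in these tests.
expected, actual = {"a", "b"}, {"a"}

# Old wrapping: condition split across lines, message trailing the comma.
assert actual.issubset(
    expected
), f"unexpected members: {actual - expected}"

# New wrapping: condition on one line, message parenthesized on its own lines.
assert actual.issubset(expected), (
    f"unexpected members: {actual - expected}"
)
```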


@@ -145,9 +145,9 @@ class TestIndexer:
completion = subprocess.run(
command, env={**os.environ, "GRAPHRAG_INPUT_FILE_TYPE": input_file_type}
)
assert (
completion.returncode == 0
), f"Indexer failed with return code: {completion.returncode}"
assert completion.returncode == 0, (
f"Indexer failed with return code: {completion.returncode}"
)
def __assert_indexer_outputs(
self, root: Path, workflow_config: dict[str, dict[str, Any]]
@@ -158,9 +158,9 @@ class TestIndexer:
output_entries.sort(key=lambda entry: entry.stat().st_ctime, reverse=True)
if not debug:
assert (
len(output_entries) == 1
), f"Expected one output folder, found {len(output_entries)}"
assert len(output_entries) == 1, (
f"Expected one output folder, found {len(output_entries)}"
)
output_path = output_entries[0]
assert output_path.exists(), "output folder does not exist"
@@ -174,18 +174,18 @@ class TestIndexer:
# Check all workflows run
expected_workflows = set(workflow_config.keys())
workflows = set(stats["workflows"].keys())
assert (
workflows == expected_workflows
), f"Workflows missing from stats.json: {expected_workflows - workflows}. Unexpected workflows in stats.json: {workflows - expected_workflows}"
assert workflows == expected_workflows, (
f"Workflows missing from stats.json: {expected_workflows - workflows}. Unexpected workflows in stats.json: {workflows - expected_workflows}"
)
# [OPTIONAL] Check runtime
for workflow in expected_workflows:
# Check max runtime
max_runtime = workflow_config[workflow].get("max_runtime", None)
if max_runtime:
assert (
stats["workflows"][workflow]["overall"] <= max_runtime
), f"Expected max runtime of {max_runtime}, found: {stats['workflows'][workflow]['overall']} for workflow: {workflow}"
assert stats["workflows"][workflow]["overall"] <= max_runtime, (
f"Expected max runtime of {max_runtime}, found: {stats['workflows'][workflow]['overall']} for workflow: {workflow}"
)
# Check artifacts
artifact_files = os.listdir(artifacts)
@@ -195,10 +195,11 @@ class TestIndexer:
transient_workflows = [
"workflow:create_base_text_units",
]
assert (
len(artifact_files)
== (len(expected_workflows) - len(transient_workflows) + 1)
), f"Expected {len(expected_workflows) + 1} artifacts, found: {len(artifact_files)}"
assert len(artifact_files) == (
len(expected_workflows) - len(transient_workflows) + 1
), (
f"Expected {len(expected_workflows) + 1} artifacts, found: {len(artifact_files)}"
)
for artifact in artifact_files:
if artifact.endswith(".parquet"):
@@ -211,16 +212,18 @@ class TestIndexer:
workflow["row_range"][0]
<= len(output_df)
<= workflow["row_range"][1]
), f"Expected between {workflow['row_range'][0]} and {workflow['row_range'][1]}, found: {len(output_df)} for file: {artifact}"
), (
f"Expected between {workflow['row_range'][0]} and {workflow['row_range'][1]}, found: {len(output_df)} for file: {artifact}"
)
# Get non-nan rows
nan_df = output_df.loc[
:, ~output_df.columns.isin(workflow.get("nan_allowed_columns", []))
]
nan_df = nan_df[nan_df.isna().any(axis=1)]
assert (
len(nan_df) == 0
), f"Found {len(nan_df)} rows with NaN values for file: {artifact} on columns: {nan_df.columns[nan_df.isna().any()].tolist()}"
assert len(nan_df) == 0, (
f"Found {len(nan_df)} rows with NaN values for file: {artifact} on columns: {nan_df.columns[nan_df.isna().any()].tolist()}"
)
def __run_query(self, root: Path, query_config: dict[str, str]):
command = [
@@ -296,8 +299,8 @@ class TestIndexer:
result.stderr if "No existing dataset at" not in result.stderr else ""
)
assert (
stderror == "" or stderror.replace("\n", "") in KNOWN_WARNINGS
), f"Query failed with error: {stderror}"
assert stderror == "" or stderror.replace("\n", "") in KNOWN_WARNINGS, (
f"Query failed with error: {stderror}"
)
assert result.stdout is not None, "Query returned no output"
assert len(result.stdout) > 0, "Query returned empty output"


@@ -81,9 +81,9 @@ async def test_normal_result_emits_parquet():
]
assert len(pipeline_result) == 1
assert (
storage.keys() == ["stats.json", "mock_write", "mock_workflow.parquet"]
), "Mock workflow output should be written to storage by the emitter when there is a non-empty data frame"
assert storage.keys() == ["stats.json", "mock_write", "mock_workflow.parquet"], (
"Mock workflow output should be written to storage by the emitter when there is a non-empty data frame"
)
async def test_empty_result_does_not_emit_parquet():


@@ -74,9 +74,9 @@ async def test_create_base_entity_graph():
context=context,
)
assert len(actual.columns) == len(
expected.columns
), "Graph dataframe columns differ"
assert len(actual.columns) == len(expected.columns), (
"Graph dataframe columns differ"
)
# let's parse a sample of the raw graphml
actual_graphml_0 = actual["clustered_graph"][:1][0]
actual_graph_0 = nx.parse_graphml(actual_graphml_0)
@@ -121,9 +121,9 @@ async def test_create_base_entity_graph_with_embeddings():
context=context,
)
assert (
len(actual.columns) == len(expected.columns) + 1
), "Graph dataframe missing embedding column"
assert len(actual.columns) == len(expected.columns) + 1, (
"Graph dataframe missing embedding column"
)
assert "embeddings" in actual.columns, "Graph dataframe missing embedding column"


@@ -86,9 +86,9 @@ def compare_outputs(
"""
cols = expected.columns if columns is None else columns
assert len(actual) == len(
expected
), f"Expected: {len(expected)} rows, Actual: {len(actual)} rows"
assert len(actual) == len(expected), (
f"Expected: {len(expected)} rows, Actual: {len(actual)} rows"
)
for column in cols:
assert column in actual.columns