Fix id baseline (#2036)
Some checks failed
gh-pages / build (push) Has been cancelled
Python CI / python-ci (ubuntu-latest, 3.10) (push) Has been cancelled
Python CI / python-ci (ubuntu-latest, 3.11) (push) Has been cancelled
Python CI / python-ci (windows-latest, 3.10) (push) Has been cancelled
Python CI / python-ci (windows-latest, 3.11) (push) Has been cancelled
Python Integration Tests / python-ci (ubuntu-latest, 3.10) (push) Has been cancelled
Python Integration Tests / python-ci (windows-latest, 3.10) (push) Has been cancelled
Python Notebook Tests / python-ci (ubuntu-latest, 3.10) (push) Has been cancelled
Python Notebook Tests / python-ci (windows-latest, 3.10) (push) Has been cancelled
Python Publish (pypi) / Upload release to PyPI (push) Has been cancelled
Python Smoke Tests / python-ci (ubuntu-latest, 3.10) (push) Has been cancelled
Python Smoke Tests / python-ci (windows-latest, 3.10) (push) Has been cancelled
Spellcheck / spellcheck (push) Has been cancelled

* Fix all human_readable_id columns to start at 0

* Semver
This commit is contained in:
Nathan Evans 2025-08-27 11:15:21 -07:00 committed by GitHub
parent 30bdb35cc8
commit 69ad36e735
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
14 changed files with 10 additions and 12 deletions

View File

@ -0,0 +1,4 @@
{
"type": "patch",
"description": "Fix all human_readable_id fields to be 0-based."
}

View File

@ -69,7 +69,7 @@ def create_final_documents(
).reset_index(drop=True)
rejoined["id"] = rejoined["id"].astype(str)
-rejoined["human_readable_id"] = rejoined.index + 1
+rejoined["human_readable_id"] = rejoined.index
if "metadata" not in rejoined.columns:
rejoined["metadata"] = pd.Series(dtype="object")

View File

@ -60,7 +60,7 @@ def create_final_text_units(
) -> pd.DataFrame:
"""All the steps to transform the text units."""
selected = text_units.loc[:, ["id", "text", "document_ids", "n_tokens"]]
-selected["human_readable_id"] = selected.index + 1
+selected["human_readable_id"] = selected.index
entity_join = _entities(final_entities)
relationship_join = _relationships(final_relationships)

View File

@ -88,6 +88,6 @@ async def extract_covariates(
)
text_units.drop(columns=["text_unit_id"], inplace=True) # don't pollute the global
covariates["id"] = covariates["covariate_type"].apply(lambda _x: str(uuid4()))
-covariates["human_readable_id"] = covariates.index + 1
+covariates["human_readable_id"] = covariates.index
return covariates.loc[:, COVARIATES_FINAL_COLUMNS]

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -58,8 +58,8 @@ async def test_extract_covariates():
assert_series_equal(actual["text_unit_id"], input["id"], check_names=False)
# make sure the human ids are incrementing
-assert actual["human_readable_id"][0] == 1
-assert actual["human_readable_id"][1] == 2
+assert actual["human_readable_id"][0] == 0
+assert actual["human_readable_id"][1] == 1
# check that the mock data is parsed and inserted into the correct columns
assert actual["covariate_type"][0] == "claim"

View File

@ -30,9 +30,6 @@ async def test_finalize_graph():
"relationships", context.output_storage
)
assert len(nodes_actual) == 291
assert len(edges_actual) == 452
# x and y will be zero with the default configuration, because we do not embed/umap
assert nodes_actual["x"].sum() == 0
assert nodes_actual["y"].sum() == 0
@ -58,9 +55,6 @@ async def test_finalize_graph_umap():
"relationships", context.output_storage
)
assert len(nodes_actual) == 291
assert len(edges_actual) == 452
# x and y should have some value other than zero due to umap
assert nodes_actual["x"].sum() != 0
assert nodes_actual["y"].sum() != 0

View File

@ -28,4 +28,4 @@ async def test_prune_graph():
nodes_actual = await load_table_from_storage("entities", context.output_storage)
-assert len(nodes_actual) == 21
+assert len(nodes_actual) == 20