
Commit 6d7a68d

alekszievr, dexters1, and borisarzentar authored Mar 3, 2025
Feat: Store descriptive metrics identified by pipeline run id [cog-1260] (#582)
## DCO Affirmation

I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.

## Summary by CodeRabbit

- **New Features**
  - Introduced a new analytic capability that calculates descriptive graph metrics for pipeline runs when enabled.
  - Updated the execution flow to include an option for activating the graph metrics step.
- **Chores**
  - Removed the previous mechanism for storing descriptive metrics to streamline the system.

Co-authored-by: Igor Ilic <30923996+dexters1@users.noreply.github.com>
Co-authored-by: Boris <boris@topoteretes.com>
1 parent 10e4bfb commit 6d7a68d

File tree

5 files changed: +69 −54 lines changed
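
Taken together, the diffs below export a new get_pipeline_run_metrics function, remove the old store_descriptive_metrics helper, and add a graph-metrics step to the dynamic-steps example. A minimal sketch of the resulting flow, condensed from examples/python/dynamic_steps_example.py further down (not part of the commit; it assumes data has already been added in the usual way before cognify runs):

import asyncio

import cognee


async def run_metrics_step():
    # Build the knowledge graph; per the example below, cognify() returns
    # the pipeline run(s) that the metrics step consumes.
    pipeline_run = await cognee.cognify()

    # New in this commit: compute descriptive graph metrics for those runs
    # and store them keyed by pipeline run id.
    await cognee.get_pipeline_run_metrics(pipeline_run, include_optional=True)


asyncio.run(run_metrics_step())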
 

‎cognee/__init__.py

+1

@@ -8,6 +8,7 @@
 from cognee.modules.visualization.cognee_network_visualization import (
     cognee_network_visualization,
 )
+from .modules.data.operations.get_pipeline_run_metrics import get_pipeline_run_metrics
 
 # Pipelines
 from .modules import pipelines
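
Because the import above re-exports the function at package level, it can also be pulled in directly (a one-line sketch, not part of the diff):

from cognee import get_pipeline_run_metrics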

‎cognee/modules/data/methods/__init__.py

−2

@@ -11,5 +11,3 @@
 # Delete
 from .delete_dataset import delete_dataset
 from .delete_data import delete_data
-
-from .store_descriptive_metrics import store_descriptive_metrics

‎cognee/modules/data/methods/store_descriptive_metrics.py

−50

This file was deleted.

cognee/modules/data/operations/get_pipeline_run_metrics.py

+60 (new file)
from cognee.infrastructure.databases.relational import get_relational_engine
from sqlalchemy import select
from sqlalchemy.sql import func
from cognee.modules.data.models import Data
from cognee.modules.data.models import GraphMetrics
from cognee.infrastructure.databases.graph import get_graph_engine
from cognee.modules.pipelines.models import PipelineRun


async def fetch_token_count(db_engine) -> int:
    """
    Fetches and sums token counts from the database.

    Returns:
        int: The total number of tokens across all documents.
    """

    async with db_engine.get_async_session() as session:
        token_count_sum = await session.execute(select(func.sum(Data.token_count)))
        token_count_sum = token_count_sum.scalar()

    return token_count_sum


async def get_pipeline_run_metrics(pipeline_runs: list[PipelineRun], include_optional: bool):
    db_engine = get_relational_engine()
    graph_engine = await get_graph_engine()

    metrics_for_pipeline_runs = []

    async with db_engine.get_async_session() as session:
        for pipeline_run in pipeline_runs:
            existing_metrics = await session.execute(
                select(GraphMetrics).where(GraphMetrics.id == pipeline_run.pipeline_run_id)
            )
            existing_metrics = existing_metrics.scalars().first()

            if existing_metrics:
                metrics_for_pipeline_runs.append(existing_metrics)
            else:
                graph_metrics = await graph_engine.get_graph_metrics(include_optional)
                metrics = GraphMetrics(
                    id=pipeline_run.pipeline_run_id,
                    num_tokens=await fetch_token_count(db_engine),
                    num_nodes=graph_metrics["num_nodes"],
                    num_edges=graph_metrics["num_edges"],
                    mean_degree=graph_metrics["mean_degree"],
                    edge_density=graph_metrics["edge_density"],
                    num_connected_components=graph_metrics["num_connected_components"],
                    sizes_of_connected_components=graph_metrics["sizes_of_connected_components"],
                    num_selfloops=graph_metrics["num_selfloops"],
                    diameter=graph_metrics["diameter"],
                    avg_shortest_path_length=graph_metrics["avg_shortest_path_length"],
                    avg_clustering=graph_metrics["avg_clustering"],
                )
                metrics_for_pipeline_runs.append(metrics)
                session.add(metrics)
                await session.commit()

    return metrics_for_pipeline_runs
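
Since each GraphMetrics row is keyed by pipeline_run_id, calling the function again for the same run returns the stored row rather than recomputing the graph metrics. A sketch of that behavior (not part of the diff; the attribute names read back at the end assume the GraphMetrics columns mirror the keyword arguments set above):

import asyncio

import cognee
from cognee.modules.data.operations.get_pipeline_run_metrics import get_pipeline_run_metrics


async def inspect_metrics():
    pipeline_runs = await cognee.cognify()

    # First call computes metrics and stores one GraphMetrics row per run.
    await get_pipeline_run_metrics(pipeline_runs, include_optional=True)

    # Second call finds the existing rows by pipeline_run_id and returns them as-is.
    cached = await get_pipeline_run_metrics(pipeline_runs, include_optional=True)

    for metrics in cached:
        # Assumed attribute names, mirroring the columns assigned above.
        print(metrics.id, metrics.num_nodes, metrics.num_edges, metrics.mean_degree)


asyncio.run(inspect_metrics())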

‎examples/python/dynamic_steps_example.py

+8 −2

@@ -180,10 +180,15 @@ async def main(enable_steps):
 
     # Step 3: Create knowledge graph
     if enable_steps.get("cognify"):
-        await cognee.cognify()
+        pipeline_run = await cognee.cognify()
         print("Knowledge graph created.")
 
-    # Step 4: Query insights
+    # Step 4: Calculate descriptive metrics
+    if enable_steps.get("graph_metrics"):
+        await cognee.get_pipeline_run_metrics(pipeline_run, include_optional=True)
+        print("Descriptive graph metrics saved to database.")
+
+    # Step 5: Query insights
     if enable_steps.get("retriever"):
         search_results = await cognee.search(
             query_type=SearchType.GRAPH_COMPLETION, query_text="Who has experience in design tools?"

@@ -201,6 +206,7 @@ async def main(enable_steps):
         "prune_system": rebuild_kg,
         "add_text": rebuild_kg,
         "cognify": rebuild_kg,
+        "graph_metrics": rebuild_kg,
         "retriever": retrieve,
     }
