Commit 1cb8331

Authored by lxobr, Vasilije1990, and hajdul88
feat: add experimental cognify pipeline [COG-1293] (#541)
feat: add experimental cognify pipeline [COG-1293] (#541)

## Description

- Integrate experimental tasks into the evaluation framework

## DCO Affirmation

I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.

## Summary by CodeRabbit

- **New Features**
  - Introduced interactive prompt templates for extracting graph nodes, edge triplets, and relationship names, resulting in more comprehensive and accurate knowledge graphs.
  - Added asynchronous processes to efficiently handle document data and integrate graph components.
  - Launched cascade graph task options to offer enhanced flexibility in task management workflows.
  - Added new functionality for extracting content nodes and relationship names from text.
- **Refactor**
  - Streamlined configurations for prompt processing and task initialization, improving overall modularity and system stability.
  - Updated task getter mechanisms to use function-based approaches for improved flexibility.

Co-authored-by: Vasilije <8619304+Vasilije1990@users.noreply.github.com>
Co-authored-by: hajdul88 <52442977+hajdul88@users.noreply.github.com>
1 parent 55411ff · commit 1cb8331

24 files changed: +380 −70 lines

cognee/infrastructure/llm/prompts/read_query_prompt.py (+5 −2)

```diff
@@ -3,10 +3,13 @@
 from cognee.root_dir import get_absolute_path


-def read_query_prompt(prompt_file_name: str):
+def read_query_prompt(prompt_file_name: str, base_directory: str = None):
     """Read a query prompt from a file."""
     try:
-        file_path = path.join(get_absolute_path("./infrastructure/llm/prompts"), prompt_file_name)
+        if base_directory is None:
+            base_directory = get_absolute_path("./infrastructure/llm/prompts")
+
+        file_path = path.join(base_directory, prompt_file_name)

         with open(file_path, "r", encoding="utf-8") as file:
             return file.read()
```

cognee/infrastructure/llm/prompts/render_prompt.py (+3 −2)

```diff
@@ -2,14 +2,15 @@
 from cognee.root_dir import get_absolute_path


-def render_prompt(filename: str, context: dict) -> str:
+def render_prompt(filename: str, context: dict, base_directory: str = None) -> str:
     """Render a Jinja2 template asynchronously.
     :param filename: The name of the template file to render.
     :param context: The context to render the template with.
     :return: The rendered template as a string."""

     # Set the base directory relative to the cognee root directory
-    base_directory = get_absolute_path("./infrastructure/llm/prompts")
+    if base_directory is None:
+        base_directory = get_absolute_path("./infrastructure/llm/prompts")

     # Initialize the Jinja2 environment to load templates from the filesystem
     env = Environment(
```
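Both prompt helpers now take an optional `base_directory` that falls back to the built-in prompts folder when omitted. A minimal standalone sketch of this default-or-override pattern, using temp directories in place of cognee's `get_absolute_path` (so none of cognee's modules are involved):

```python
import os
import tempfile

# Stand-in for the built-in prompts folder that get_absolute_path would resolve.
DEFAULT_PROMPT_DIR = tempfile.mkdtemp()


def read_query_prompt(prompt_file_name: str, base_directory: str = None) -> str:
    """Read a prompt file, falling back to the default directory when none is given."""
    if base_directory is None:
        base_directory = DEFAULT_PROMPT_DIR
    file_path = os.path.join(base_directory, prompt_file_name)
    with open(file_path, "r", encoding="utf-8") as file:
        return file.read()


# The default location still works unchanged...
with open(os.path.join(DEFAULT_PROMPT_DIR, "greet.txt"), "w", encoding="utf-8") as f:
    f.write("Hello from the default dir")

# ...while a caller-supplied directory overrides it.
custom_dir = tempfile.mkdtemp()
with open(os.path.join(custom_dir, "greet.txt"), "w", encoding="utf-8") as f:
    f.write("Hello from a custom dir")

default_text = read_query_prompt("greet.txt")
custom_text = read_query_prompt("greet.txt", base_directory=custom_dir)
```

This keeps every existing call site working while letting the new cascade tasks point at their own prompts directory.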

cognee/tasks/experimental/__init__.py

Whitespace-only changes.

cognee/tasks/graph/cascade_extract/__init__.py

Whitespace-only changes.
cognee/tasks/graph/cascade_extract/prompts/extract_graph_edge_triplets_prompt_input.txt (new file, +18 lines)

```
Using the provided potential nodes and relationships, extract concrete edges from the following text. Build upon previously extracted nodes and edges (if any), as this is round {{ round_number }} of {{ total_rounds }}.

**Text:**
{{ text }}

**Potential Nodes to Use:**
{{ potential_nodes }}

**Potential Relationships to Use:**
{{ potential_relationship_names }}

**Previously Extracted Nodes:**
{{ previous_nodes }}

**Previously Extracted Edge Triplets:**
{{ previous_edge_triplets }}

Create specific edge triplets between nodes, ensuring each connection is clearly supported by the text content. Use the potential nodes and relationships as your primary building blocks, while considering previously extracted nodes and edges for consistency and completeness.
```
cognee/tasks/graph/cascade_extract/prompts/extract_graph_edge_triplets_prompt_system.txt (new file, +8 lines)

```
You are an expert in knowledge graph building focusing on the extraction of graph triplets.
Your task is to extract structured knowledge graph triplets from text, using a provided list of potential nodes and relationship names as reference.
• Form triplets in the format (start_node, relationship_name, end_node), selecting the most precise and relevant relationship.
• Identify explicit and implied relationships by leveraging the given nodes and relationship names, as well as logical inference.
• Ensure completeness by cross-checking all nodes and relationships across multiple rounds.
• Exclude trivial, redundant, or nonsensical triplets, keeping only meaningful and well-structured connections.
• Add relevant edge triplets beyond the available potential nodes and relationship names.
• Return a list of extracted triplets, ensuring clarity and accuracy for knowledge graph integration.
```
cognee/tasks/graph/cascade_extract/prompts/extract_graph_nodes_prompt_input.txt (new file, +7 lines)

```
Extract distinct entities and concepts from the following text to expand the knowledge graph. Build upon previously extracted entities, ensuring completeness and consistency. This is round {{ round_number }} of {{ total_rounds }}.

**Text:**
{{ text }}

**Previously Extracted Entities:**
{{ previous_entities }}
```
cognee/tasks/graph/cascade_extract/prompts/extract_graph_nodes_prompt_system.txt (new file, +8 lines)

```
You are an expert in entity extraction and knowledge graph building focusing on node identification.
Your task is to perform a detailed entity and concept extraction from text to generate a list of potential nodes for a knowledge graph.
• Extract clear, distinct entities and concepts as individual strings.
• Be exhaustive: ensure completeness by capturing all entities, names, nouns, noun parts, and implied or implicit mentions.
• Also extract potential entity type nodes, directly mentioned or implied.
• Avoid duplicates and overly generic terms.
• Consider different perspectives and indirect references.
• Return only a list of unique node strings with all the entities.
```
cognee/tasks/graph/cascade_extract/prompts/extract_graph_relationship_names_prompt_input.txt (new file, +15 lines)

```
Analyze the following text to identify relationships between entities in the knowledge graph. This is round {{ round_number }} of {{ total_rounds }}.

**Text:**
{{ text }}

**Previously Extracted Potential Nodes:**
{{ potential_nodes }}

**Nodes Identified in Previous Rounds:**
{{ previous_nodes }}

**Relationships Identified in Previous Rounds:**
{{ previous_relationship_names }}

Extract both explicit and implicit relationships between the nodes, building upon previous findings while ensuring completeness and consistency.
```
cognee/tasks/graph/cascade_extract/prompts/extract_graph_relationship_names_prompt_system.txt (new file, +6 lines)

```
You are an expert in relationship identification and knowledge graph building focusing on relationships. Your task is to perform a detailed extraction of relationship names from the text.
• Extract all relationship names from explicit phrases, verbs, and implied context that could help form edge triplets.
• Use the potential nodes and reassign them to relationship names if they correspond to a relation, verb, action, or similar.
• Ensure completeness by working in multiple rounds, capturing overlooked connections and refining the nodes list.
• Focus on meaningful entities and relationships, whether directly stated, implied, or implicit.
• Return two lists: refined nodes and potential relationship names (for forming edges).
```

cognee/tasks/graph/cascade_extract/utils/__init__.py

Whitespace-only changes.
cognee/tasks/graph/cascade_extract/utils/extract_content_nodes_and_relationship_names.py (new file, +61 lines)

```python
from typing import List, Tuple
from pydantic import BaseModel

from cognee.infrastructure.llm.get_llm_client import get_llm_client
from cognee.infrastructure.llm.prompts import render_prompt, read_query_prompt
from cognee.root_dir import get_absolute_path


class PotentialNodesAndRelationshipNames(BaseModel):
    """Response model containing lists of potential node names and relationship names."""

    nodes: List[str]
    relationship_names: List[str]


async def extract_content_nodes_and_relationship_names(
    content: str, existing_nodes: List[str], n_rounds: int = 2
) -> Tuple[List[str], List[str]]:
    """Extracts node names and relationship names from content through multiple rounds of analysis."""
    llm_client = get_llm_client()
    all_nodes: List[str] = existing_nodes.copy()
    all_relationship_names: List[str] = []
    existing_node_set = {node.lower() for node in all_nodes}
    existing_relationship_names = set()

    for round_num in range(n_rounds):
        context = {
            "text": content,
            "potential_nodes": existing_nodes,
            "previous_nodes": all_nodes,
            "previous_relationship_names": all_relationship_names,
            "round_number": round_num + 1,
            "total_rounds": n_rounds,
        }

        base_directory = get_absolute_path("./tasks/graph/cascade_extract/prompts")
        text_input = render_prompt(
            "extract_graph_relationship_names_prompt_input.txt",
            context,
            base_directory=base_directory,
        )
        system_prompt = read_query_prompt(
            "extract_graph_relationship_names_prompt_system.txt", base_directory=base_directory
        )
        response = await llm_client.acreate_structured_output(
            text_input=text_input,
            system_prompt=system_prompt,
            response_model=PotentialNodesAndRelationshipNames,
        )

        for node in response.nodes:
            if node.lower() not in existing_node_set:
                all_nodes.append(node)
                existing_node_set.add(node.lower())

        for relationship_name in response.relationship_names:
            if relationship_name.lower() not in existing_relationship_names:
                all_relationship_names.append(relationship_name)
                existing_relationship_names.add(relationship_name.lower())

    return all_nodes, all_relationship_names
```
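The accumulation loops above deduplicate case-insensitively while preserving first-seen casing and insertion order. A self-contained sketch of that merge step, with the helper name `merge_unique` invented for illustration:

```python
from typing import Iterable, List, Set


def merge_unique(accumulated: List[str], seen_lower: Set[str], candidates: Iterable[str]) -> None:
    """Append candidates whose lowercase form is unseen, keeping first-seen casing and order."""
    for item in candidates:
        key = item.lower()
        if key not in seen_lower:
            accumulated.append(item)
            seen_lower.add(key)


# Simulate two extraction rounds returning overlapping node names.
all_nodes: List[str] = ["Cognee"]
seen = {n.lower() for n in all_nodes}

merge_unique(all_nodes, seen, ["Knowledge Graph", "cognee", "Pipeline"])   # "cognee" is a dupe
merge_unique(all_nodes, seen, ["pipeline", "Edge Triplet"])                # "pipeline" is a dupe
```

Tracking a separate lowercase set alongside the list keeps lookups O(1) without losing the original casing the LLM returned.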
cognee/tasks/graph/cascade_extract/utils/extract_edge_triplets.py (new file, +60 lines)

```python
from typing import List, Tuple
from cognee.infrastructure.llm.get_llm_client import get_llm_client
from cognee.infrastructure.llm.prompts import render_prompt, read_query_prompt
from cognee.shared.data_models import KnowledgeGraph
from cognee.root_dir import get_absolute_path


async def extract_edge_triplets(
    content: str, nodes: List[str], relationship_names: List[str], n_rounds: int = 2
) -> KnowledgeGraph:
    """Creates a knowledge graph by identifying relationships between the provided nodes."""
    llm_client = get_llm_client()
    final_graph = KnowledgeGraph(nodes=[], edges=[])
    existing_nodes = set()
    existing_node_ids = set()
    existing_edge_triplets = set()

    for round_num in range(n_rounds):
        context = {
            "text": content,
            "potential_nodes": nodes,
            "potential_relationship_names": relationship_names,
            "previous_nodes": existing_nodes,
            "previous_edge_triplets": existing_edge_triplets,
            "round_number": round_num + 1,
            "total_rounds": n_rounds,
        }

        base_directory = get_absolute_path("./tasks/graph/cascade_extract/prompts")
        text_input = render_prompt(
            "extract_graph_edge_triplets_prompt_input.txt", context, base_directory=base_directory
        )
        system_prompt = read_query_prompt(
            "extract_graph_edge_triplets_prompt_system.txt", base_directory=base_directory
        )
        extracted_graph = await llm_client.acreate_structured_output(
            text_input=text_input, system_prompt=system_prompt, response_model=KnowledgeGraph
        )

        for node in extracted_graph.nodes:
            if node.name not in existing_nodes:
                final_graph.nodes.append(node)
                existing_nodes.add(node.name)
                existing_node_ids.add(node.id)

        for edge in extracted_graph.edges:
            edge_key = (edge.source_node_id, edge.target_node_id, edge.relationship_name)
            if edge_key in existing_edge_triplets:
                continue

            if not (
                edge.source_node_id in existing_node_ids
                and edge.target_node_id in existing_node_ids
            ):
                continue

            final_graph.edges.append(edge)
            existing_edge_triplets.add(edge_key)

    return final_graph
```
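The edge loop applies two filters: skip duplicate (source, target, relationship) triplets, and drop edges whose endpoints were never added as nodes. A self-contained sketch of that validation step, with a simplified `Edge` stand-in for cognee's data model:

```python
from dataclasses import dataclass
from typing import List, Set, Tuple


@dataclass
class Edge:
    """Simplified stand-in for the KnowledgeGraph edge model."""
    source_node_id: str
    target_node_id: str
    relationship_name: str


def keep_valid_edges(
    candidate_edges: List[Edge],
    known_node_ids: Set[str],
    seen_triplets: Set[Tuple[str, str, str]],
) -> List[Edge]:
    """Keep edges between known nodes, skipping duplicate triplets across rounds."""
    kept = []
    for edge in candidate_edges:
        key = (edge.source_node_id, edge.target_node_id, edge.relationship_name)
        if key in seen_triplets:
            continue  # already emitted in an earlier round
        if not (edge.source_node_id in known_node_ids and edge.target_node_id in known_node_ids):
            continue  # dangling endpoint: the LLM referenced a node it never defined
        kept.append(edge)
        seen_triplets.add(key)
    return kept


node_ids = {"a", "b"}
seen: Set[Tuple[str, str, str]] = set()
edges = [
    Edge("a", "b", "knows"),
    Edge("a", "b", "knows"),  # duplicate, dropped
    Edge("a", "c", "knows"),  # unknown endpoint "c", dropped
]
kept = keep_valid_edges(edges, node_ids, seen)
```

Passing `seen_triplets` in mutable form lets later rounds build on earlier ones, which is what the round loop in the file above relies on.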
cognee/tasks/graph/cascade_extract/utils/extract_nodes.py (new file, +45 lines)

```python
from typing import List
from pydantic import BaseModel

from cognee.modules.chunking.models.DocumentChunk import DocumentChunk
from cognee.infrastructure.llm.get_llm_client import get_llm_client
from cognee.infrastructure.llm.prompts import render_prompt, read_query_prompt
from cognee.root_dir import get_absolute_path


class PotentialNodes(BaseModel):
    """Response model containing a list of potential node names."""

    nodes: List[str]


async def extract_nodes(text: str, n_rounds: int = 2) -> List[str]:
    """Extracts node names from content through multiple rounds of analysis."""
    llm_client = get_llm_client()
    all_nodes: List[str] = []
    existing_nodes = set()

    for round_num in range(n_rounds):
        context = {
            "previous_nodes": all_nodes,
            "round_number": round_num + 1,
            "total_rounds": n_rounds,
            "text": text,
        }
        base_directory = get_absolute_path("./tasks/graph/cascade_extract/prompts")
        text_input = render_prompt(
            "extract_graph_nodes_prompt_input.txt", context, base_directory=base_directory
        )
        system_prompt = read_query_prompt(
            "extract_graph_nodes_prompt_system.txt", base_directory=base_directory
        )
        response = await llm_client.acreate_structured_output(
            text_input=text_input, system_prompt=system_prompt, response_model=PotentialNodes
        )

        for node in response.nodes:
            if node.lower() not in existing_nodes:
                all_nodes.append(node)
                existing_nodes.add(node.lower())

    return all_nodes
```

cognee/tasks/graph/extract_graph_from_data.py (+13 −9)

```diff
@@ -14,16 +14,10 @@
 from cognee.tasks.storage import add_data_points


-async def extract_graph_from_data(
-    data_chunks: list[DocumentChunk], graph_model: Type[BaseModel]
+async def integrate_chunk_graphs(
+    data_chunks: list[DocumentChunk], chunk_graphs: list, graph_model: Type[BaseModel]
 ) -> List[DocumentChunk]:
-    """
-    Extracts and integrates a knowledge graph from the text content of document chunks using a specified graph model.
-    """
-
-    chunk_graphs = await asyncio.gather(
-        *[extract_content_graph(chunk.text, graph_model) for chunk in data_chunks]
-    )
+    """Updates DocumentChunk objects, integrates data points and edges into databases."""
     graph_engine = await get_graph_engine()

     if graph_model is not KnowledgeGraph:
@@ -52,3 +46,13 @@ async def extract_graph_from_data(
     await graph_engine.add_edges(graph_edges)

     return data_chunks
+
+
+async def extract_graph_from_data(
+    data_chunks: list[DocumentChunk], graph_model: Type[BaseModel]
+) -> List[DocumentChunk]:
+    """Extracts and integrates a knowledge graph from the text content of document chunks using a specified graph model."""
+    chunk_graphs = await asyncio.gather(
+        *[extract_content_graph(chunk.text, graph_model) for chunk in data_chunks]
+    )
+    return await integrate_chunk_graphs(data_chunks, chunk_graphs, graph_model)
```
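The refactor splits per-chunk extraction (fan-out with `asyncio.gather`) from graph integration, so alternative extractors such as the cascade pipeline can reuse the same integration step. A minimal sketch of that structure, with string placeholders standing in for chunks and graphs (none of cognee's types are used):

```python
import asyncio
from typing import List


async def integrate_chunk_graphs(chunks: List[str], graphs: List[str]) -> List[str]:
    """Stand-in for the shared integration step: pairs each chunk with its graph."""
    return [f"{chunk}:{graph}" for chunk, graph in zip(chunks, graphs)]


async def default_extractor(chunk: str) -> str:
    """Stand-in for extract_content_graph."""
    return f"graph({chunk})"


async def extract_graph_from_data(chunks: List[str]) -> List[str]:
    # Fan out extraction per chunk, then hand the aligned results to the integrator.
    graphs = await asyncio.gather(*[default_extractor(c) for c in chunks])
    return await integrate_chunk_graphs(chunks, list(graphs))


result = asyncio.run(extract_graph_from_data(["c1", "c2"]))
```

Because `asyncio.gather` preserves input order, the chunk list and graph list stay index-aligned, which the integration step depends on.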
New file (+40 lines): cascade graph extraction pipeline

```python
import asyncio
from typing import List

from cognee.modules.chunking.models.DocumentChunk import DocumentChunk
from cognee.shared.data_models import KnowledgeGraph
from cognee.tasks.graph.cascade_extract.utils.extract_nodes import extract_nodes
from cognee.tasks.graph.cascade_extract.utils.extract_content_nodes_and_relationship_names import (
    extract_content_nodes_and_relationship_names,
)
from cognee.tasks.graph.cascade_extract.utils.extract_edge_triplets import (
    extract_edge_triplets,
)
from cognee.tasks.graph.extract_graph_from_data import integrate_chunk_graphs


async def extract_graph_from_data(
    data_chunks: List[DocumentChunk], n_rounds: int = 2
) -> List[DocumentChunk]:
    """Extract and update graph data from document chunks in multiple steps."""
    chunk_nodes = await asyncio.gather(
        *[extract_nodes(chunk.text, n_rounds) for chunk in data_chunks]
    )

    chunk_results = await asyncio.gather(
        *[
            extract_content_nodes_and_relationship_names(chunk.text, nodes, n_rounds)
            for chunk, nodes in zip(data_chunks, chunk_nodes)
        ]
    )

    updated_nodes, relationships = zip(*chunk_results)

    chunk_graphs = await asyncio.gather(
        *[
            extract_edge_triplets(chunk.text, nodes, rels, n_rounds)
            for chunk, nodes, rels in zip(data_chunks, updated_nodes, relationships)
        ]
    )

    return await integrate_chunk_graphs(data_chunks, chunk_graphs, KnowledgeGraph)
```
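The cascade runs three staged fan-outs: node candidates, then nodes plus relationship names, then edge triplets, feeding each stage's per-chunk output into the next via `zip`, and splitting paired results with `zip(*results)`. A self-contained sketch of that staging pattern with toy stage functions:

```python
import asyncio
from typing import List, Tuple


async def stage_nodes(text: str) -> List[str]:
    """Toy stage 1: propose node candidates per chunk."""
    return [text.upper()]


async def stage_nodes_and_rels(text: str, nodes: List[str]) -> Tuple[List[str], List[str]]:
    """Toy stage 2: refine nodes and collect relationship names per chunk."""
    return nodes + [text], ["mentions"]


async def cascade(chunks: List[str]) -> List[Tuple[List[str], List[str]]]:
    # Stage 1: per-chunk node candidates, order-aligned with chunks.
    chunk_nodes = await asyncio.gather(*[stage_nodes(c) for c in chunks])

    # Stage 2: each chunk is paired with its own stage-1 output via zip.
    results = await asyncio.gather(
        *[stage_nodes_and_rels(c, n) for c, n in zip(chunks, chunk_nodes)]
    )

    # zip(*results) splits the list of (nodes, rels) pairs into two aligned sequences.
    updated_nodes, relationships = zip(*results)
    return list(zip(updated_nodes, relationships))


out = asyncio.run(cascade(["a", "b"]))
```

Each stage awaits the whole batch before the next begins, so later stages always see complete, index-aligned results from earlier ones.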

evals/eval_framework/corpus_builder/corpus_builder_executor.py (+8 −12)

```diff
@@ -1,16 +1,18 @@
 import cognee
 import logging
-from typing import Optional, Tuple, List, Dict, Union, Any
+from typing import Optional, Tuple, List, Dict, Union, Any, Callable, Awaitable

 from evals.eval_framework.benchmark_adapters.benchmark_adapters import BenchmarkAdapter
-from evals.eval_framework.corpus_builder.task_getters.task_getters import TaskGetters
-from evals.eval_framework.corpus_builder.task_getters.base_task_getter import BaseTaskGetter
+from evals.eval_framework.corpus_builder.task_getters.TaskGetters import TaskGetters
+from cognee.modules.pipelines.tasks.Task import Task
 from cognee.shared.utils import setup_logging


 class CorpusBuilderExecutor:
     def __init__(
-        self, benchmark: Union[str, Any] = "Dummy", task_getter_type: str = "DEFAULT"
+        self,
+        benchmark: Union[str, Any] = "Dummy",
+        task_getter: Callable[..., Awaitable[List[Task]]] = None,
     ) -> None:
         if isinstance(benchmark, str):
             try:
@@ -23,13 +25,7 @@ def __init__(

         self.raw_corpus = None
         self.questions = None
-
-        try:
-            task_enum = TaskGetters(task_getter_type)
-        except KeyError:
-            raise ValueError(f"Invalid task getter type: {task_getter_type}")
-
-        self.task_getter: BaseTaskGetter = task_enum.getter_class()
+        self.task_getter = task_getter

     def load_corpus(self, limit: Optional[int] = None) -> Tuple[List[Dict], List[str]]:
         self.raw_corpus, self.questions = self.adapter.load_corpus(limit=limit)
@@ -48,5 +44,5 @@ async def run_cognee(self) -> None:

         await cognee.add(self.raw_corpus)

-        tasks = await self.task_getter.get_tasks()
+        tasks = await self.task_getter()
         await cognee.cognify(tasks=tasks)
```
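The executor now takes an awaitable callable instead of an enum key, so any task list (default or experimental cascade) can be injected without touching the executor. A minimal sketch of that injection pattern, with simplified names (`CorpusBuilder`, `default_tasks`, and string "tasks" are illustrative, not cognee's actual API):

```python
import asyncio
from typing import Awaitable, Callable, List


class CorpusBuilder:
    """Sketch of the new constructor contract: inject an awaitable task getter."""

    def __init__(self, task_getter: Callable[..., Awaitable[List[str]]]) -> None:
        self.task_getter = task_getter

    async def run(self) -> List[str]:
        # The executor simply awaits whatever callable was injected.
        return await self.task_getter()


async def default_tasks() -> List[str]:
    """Stand-in for a coroutine that builds the pipeline's task list."""
    return ["extract_nodes", "extract_edge_triplets"]


tasks = asyncio.run(CorpusBuilder(default_tasks).run())
```

Swapping pipelines then means passing a different coroutine function at construction time, with no enum registration step.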
