diff --git a/.dlt/config.toml b/.dlt/config.toml
deleted file mode 100644
index c72c145b5..000000000
--- a/.dlt/config.toml
+++ /dev/null
@@ -1,6 +0,0 @@
-# put your configuration values here
-
-[runtime]
-log_level = "WARNING" # the system log level of dlt
-# use the dlthub_telemetry setting to enable/disable anonymous usage data reporting, see https://dlthub.com/docs/telemetry
-dlthub_telemetry = false
diff --git a/README.md b/README.md
index eeee1145e..1b0e864f4 100644
--- a/README.md
+++ b/README.md
@@ -63,6 +63,7 @@ More on [use-cases](https://docs.cognee.ai/use-cases) and [evals](https://github
 
 Get started quickly with a Google Colab notebook , Deepnote notebook or starter repo
 
+
 ## Contributing
 
 Your contributions are at the core of making this a true open source project. Any contributions you make are **greatly appreciated**. See [`CONTRIBUTING.md`](CONTRIBUTING.md) for more information.
diff --git a/cognee-starter-kit/.env.template b/cognee-starter-kit/.env.template
new file mode 100644
index 000000000..10fea5553
--- /dev/null
+++ b/cognee-starter-kit/.env.template
@@ -0,0 +1,19 @@
+# If you use the OpenAI provider, just adjust the model and api_key.
+LLM_API_KEY=""
+LLM_MODEL="openai/gpt-4o-mini"
+LLM_PROVIDER="openai"
+# Not needed if you use OpenAI
+LLM_ENDPOINT=""
+LLM_API_VERSION=""
+
+# If you use the OpenAI provider, just adjust the embedding model and api_key.
+EMBEDDING_API_KEY=""
+EMBEDDING_MODEL="openai/text-embedding-3-large"
+EMBEDDING_PROVIDER="openai"
+# Not needed if you use OpenAI
+EMBEDDING_ENDPOINT=""
+EMBEDDING_API_VERSION=""
+
+
+GRAPHISTRY_USERNAME=""
+GRAPHISTRY_PASSWORD=""
\ No newline at end of file
diff --git a/cognee-starter-kit/.gitignore b/cognee-starter-kit/.gitignore
new file mode 100644
index 000000000..c99e3a58e
--- /dev/null
+++ b/cognee-starter-kit/.gitignore
@@ -0,0 +1,196 @@
+.data
+.env
+.local.env
+.prod.env
+cognee/.data/
+
+code_pipeline_output*/
+
+*.lance/
+.DS_Store
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+full_run.ipynb
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Cognee logs directory - keep directory, ignore contents
+logs/*
+!logs/.gitkeep
+!logs/README.md
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.env.local
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+.idea/
+
+.vscode/
+cognee/data/
+cognee/cache/
+
+# Default cognee system directory, used in development
+.cognee_system/
+.data_storage/
+.artifacts/
+.anon_id
+
+node_modules/
+
+# Evals
+SWE-bench_testsample/
+
+# ChromaDB Data
+.chromadb_data/
diff --git a/cognee-starter-kit/README.md b/cognee-starter-kit/README.md
new file mode 100644
index 000000000..fab9142c1
--- /dev/null
+++ b/cognee-starter-kit/README.md
@@ -0,0 +1,98 @@
+
+# Cognee Starter Kit
+Welcome to the cognee Starter Kit! This repository is designed to help you get started quickly by providing a structured dataset and pre-built data pipelines that use cognee to build powerful knowledge graphs.
+
+You can use this repo to ingest, process, and visualize data in minutes.
+
+By following this guide, you will:
+
+- Load structured company and employee data
+- Utilize pre-built pipelines for data processing
+- Perform graph-based search and query operations
+- Visualize entity relationships effortlessly on a graph
+
+# How to Use This Repo 🛠
+
+## Install uv if you don't have it on your system
+```
+pip install uv
+```
+## Install dependencies
+```
+uv sync
+```
+
+## Setup LLM
+Add environment variables to the `.env` file.
+If you use the OpenAI provider, you only need to set the model and API key, as in the example below.
+```
+LLM_PROVIDER=""
+LLM_MODEL=""
+LLM_ENDPOINT=""
+LLM_API_KEY=""
+LLM_API_VERSION=""
+
+EMBEDDING_PROVIDER=""
+EMBEDDING_MODEL=""
+EMBEDDING_ENDPOINT=""
+EMBEDDING_API_KEY=""
+EMBEDDING_API_VERSION=""
+```
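+
+For example, a minimal `.env` for the OpenAI provider could look like this (the key is a placeholder, not a real credential; the model names match `.env.template`):
+```
+LLM_PROVIDER="openai"
+LLM_MODEL="openai/gpt-4o-mini"
+LLM_API_KEY="sk-..."
+
+EMBEDDING_PROVIDER="openai"
+EMBEDDING_MODEL="openai/text-embedding-3-large"
+EMBEDDING_API_KEY="sk-..."
+```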
+
+Activate the Python environment:
+```
+source .venv/bin/activate
+```
+
+## Run the Default Pipeline
+
+This script runs the cognify pipeline with default settings. It ingests text data, builds a knowledge graph, and lets you run search queries.
+
+```
+python src/pipelines/default.py
+```
+
+## Run the Low-Level Pipeline
+
+This script implements its own pipeline with a custom ingestion task. It processes the given JSON data about companies and employees and makes it searchable via a graph.
+
+```
+python src/pipelines/low_level.py
+```
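+
+Under the hood, a low-level pipeline is just a list of `Task` objects run in sequence. A minimal sketch, mirroring `src/pipelines/low_level.py` (which defines `ingest_files`, `dataset_id`, and `user`):
+```
+from cognee.pipelines import run_tasks, Task
+from cognee.tasks.storage import add_data_points
+
+# ingest_files yields DataPoint instances; add_data_points stores them.
+pipeline = run_tasks(
+    [Task(ingest_files), Task(add_data_points)], dataset_id, None, user, "demo_pipeline"
+)
+
+async for status in pipeline:
+    print(status)
+```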
+
+## Run the Custom Model Pipeline
+
+This pipeline uses a custom Pydantic model for graph extraction. As an example, the script categorizes programming languages and visualizes their relationships.
+
+```
+python src/pipelines/custom-model.py
+```
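+
+A custom graph model is a set of `DataPoint` subclasses. A shortened sketch of the model defined in `src/pipelines/custom-model.py` (`Field` and `ProgrammingLanguageType` are declared the same way):
+```
+from cognee.low_level import DataPoint
+
+class ProgrammingLanguage(DataPoint):
+    name: str
+    used_in: list[Field] = []  # edges to Field nodes
+    is_type: ProgrammingLanguageType  # edge to a shared type node
+    metadata: dict = {"index_fields": ["name"]}
+```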
+
+## Graph preview
+
+cognee provides a `visualize_graph` function that renders the graph for you.
+
+```
+graph_file_path = str(
+    pathlib.Path(
+        os.path.join(pathlib.Path(__file__).parent, ".artifacts/graph_visualization.html")
+    ).resolve()
+)
+await visualize_graph(graph_file_path)
+```
+If you want to use tools like Graphistry for graph visualization:
+- create an account and an API key at https://www.graphistry.com
+- add the following environment variables to the `.env` file:
+```
+GRAPHISTRY_USERNAME=""
+GRAPHISTRY_PASSWORD=""
+```
+Note: `GRAPHISTRY_PASSWORD` is the API key.
+
+
+# What will you build with cognee?
+
+- Expand the dataset by adding more structured/unstructured data
+- Customize the data model to fit your use case
+- Use the search API to build an intelligent assistant (see the sketch below)
+- Visualize knowledge graphs for better insights
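+
+A minimal search call, as used by the pipeline scripts (run it inside an async function):
+```
+from cognee import search, SearchType
+
+results = await search(
+    query_text="Who works for GreenFuture Solutions?",
+    query_type=SearchType.GRAPH_COMPLETION,
+)
+```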
diff --git a/cognee-starter-kit/pyproject.toml b/cognee-starter-kit/pyproject.toml
new file mode 100644
index 000000000..ba461b1b4
--- /dev/null
+++ b/cognee-starter-kit/pyproject.toml
@@ -0,0 +1,11 @@
+[project]
+name = "cognee-starter"
+version = "0.1.1"
+description = "Starter project which can be harvested for parts"
+readme = "README.md"
+
+requires-python = ">=3.10, <=3.13"
+
+dependencies = [
+    "cognee>=0.1.38",
+]
diff --git a/cognee-starter-kit/src/data/companies.json b/cognee-starter-kit/src/data/companies.json
new file mode 100644
index 000000000..c402ec802
--- /dev/null
+++ b/cognee-starter-kit/src/data/companies.json
@@ -0,0 +1,38 @@
+[
+  {
+    "name": "TechNova Inc.",
+    "departments": [
+      "Engineering",
+      "Marketing"
+    ]
+  },
+  {
+    "name": "GreenFuture Solutions",
+    "departments": [
+      "Research & Development",
+      "Sales",
+      "Customer Support"
+    ]
+  },
+  {
+    "name": "Skyline Financials",
+    "departments": [
+      "Accounting"
+    ]
+  },
+  {
+    "name": "MediCare Plus",
+    "departments": [
+      "Healthcare",
+      "Administration"
+    ]
+  },
+  {
+    "name": "NextGen Robotics",
+    "departments": [
+      "AI Development",
+      "Manufacturing",
+      "HR"
+    ]
+  }
+]
diff --git a/cognee-starter-kit/src/data/people.json b/cognee-starter-kit/src/data/people.json
new file mode 100644
index 000000000..3474d8794
--- /dev/null
+++ b/cognee-starter-kit/src/data/people.json
@@ -0,0 +1,52 @@
+[
+  {
+    "name": "John Doe",
+    "company": "TechNova Inc.",
+    "department": "Engineering"
+  },
+  {
+    "name": "Jane Smith",
+    "company": "TechNova Inc.",
+    "department": "Marketing"
+  },
+  {
+    "name": "Alice Johnson",
+    "company": "GreenFuture Solutions",
+    "department": "Sales"
+  },
+  {
+    "name": "Bob Williams",
+    "company": "GreenFuture Solutions",
+    "department": "Customer Support"
+  },
+  {
+    "name": "Michael Brown",
+    "company": "Skyline Financials",
+    "department": "Accounting"
+  },
+  {
+    "name": "Emily Davis",
+    "company": "MediCare Plus",
+    "department": "Healthcare"
+  },
+  {
+    "name": "David Wilson",
+    "company": "MediCare Plus",
+    "department": "Administration"
+  },
+  {
+    "name": "Emma Thompson",
+    "company": "NextGen Robotics",
+    "department": "AI Development"
+  },
+  {
+    "name": "Chris Martin",
+    "company": "NextGen Robotics",
+    "department": "Manufacturing"
+  },
+  {
+    "name": "Sophia White",
+    "company": "NextGen Robotics",
+    "department": "HR"
+  }
+]
diff --git a/cognee-starter-kit/src/pipelines/custom-model.py b/cognee-starter-kit/src/pipelines/custom-model.py
new file mode 100644
index 000000000..347758636
--- /dev/null
+++ b/cognee-starter-kit/src/pipelines/custom-model.py
@@ -0,0 +1,92 @@
+import os
+import asyncio
+import pathlib
+from cognee import config, add, cognify, search, SearchType, prune, visualize_graph
+
+# from cognee.shared.utils import render_graph
+from cognee.low_level import DataPoint
+
+
+async def main():
+    data_directory_path = str(
+        pathlib.Path(os.path.join(pathlib.Path(__file__).parent, ".data_storage")).resolve()
+    )
+    # Set up the data directory. Cognee will store files here.
+    config.data_root_directory(data_directory_path)
+
+    cognee_directory_path = str(
+        pathlib.Path(os.path.join(pathlib.Path(__file__).parent, ".cognee_system")).resolve()
+    )
+    # Set up the Cognee system directory. Cognee will store system files and databases here.
+    config.system_root_directory(cognee_directory_path)
+
+    # Prune data and system metadata before running, only if we want "fresh" state.
+    await prune.prune_data()
+    await prune.prune_system(metadata=True)
+
+    text = "The Python programming language is widely used in data analysis, web development, and machine learning."
+
+    # Add the text data to Cognee.
+    await add(text)
+
+    # Define a custom graph model for programming languages.
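+    # Each DataPoint subclass below becomes a node type in the graph; fields that
+    # point to other DataPoints (is_type, used_in) become edges, and the
+    # "index_fields" metadata marks which attributes cognee indexes for search.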
+    class FieldType(DataPoint):
+        name: str = "Field"
+
+    class Field(DataPoint):
+        name: str
+        is_type: FieldType
+        metadata: dict = {"index_fields": ["name"]}
+
+    class ProgrammingLanguageType(DataPoint):
+        name: str = "Programming Language"
+
+    class ProgrammingLanguage(DataPoint):
+        name: str
+        used_in: list[Field] = []
+        is_type: ProgrammingLanguageType
+        metadata: dict = {"index_fields": ["name"]}
+
+    # Cognify the text data.
+    await cognify(graph_model=ProgrammingLanguage)
+
+    # # Get a graphistry url (Register for a free account at https://www.graphistry.com)
+    # url = await render_graph()
+    # print(f"Graphistry URL: {url}")
+
+    # Or use our simple graph preview
+    graph_file_path = str(
+        pathlib.Path(
+            os.path.join(pathlib.Path(__file__).parent, ".artifacts/graph_visualization.html")
+        ).resolve()
+    )
+    await visualize_graph(graph_file_path)
+
+    # Completion query that uses graph data to form context.
+    graph_completion = await search(
+        query_text="What is python?", query_type=SearchType.GRAPH_COMPLETION
+    )
+    print("Graph completion result is:")
+    print(graph_completion)
+
+    # Completion query that uses document chunks to form context.
+    rag_completion = await search(
+        query_text="What is Python?", query_type=SearchType.RAG_COMPLETION
+    )
+    print("Completion result is:")
+    print(rag_completion)
+
+    # Query all summaries related to the query.
+    summaries = await search(query_text="Python", query_type=SearchType.SUMMARIES)
+    print("Summary results are:")
+    for summary in summaries:
+        print(summary)
+
+    # Query all chunks related to the query.
+    chunks = await search(query_text="Python", query_type=SearchType.CHUNKS)
+    print("Chunk results are:")
+    for chunk in chunks:
+        print(chunk)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/cognee-starter-kit/src/pipelines/default.py b/cognee-starter-kit/src/pipelines/default.py
new file mode 100644
index 000000000..4ac4bcbc4
--- /dev/null
+++ b/cognee-starter-kit/src/pipelines/default.py
@@ -0,0 +1,72 @@
+import os
+import asyncio
+import pathlib
+from cognee import config, add, cognify, search, SearchType, prune, visualize_graph
+# from cognee.shared.utils import render_graph
+
+
+async def main():
+    data_directory_path = str(
+        pathlib.Path(os.path.join(pathlib.Path(__file__).parent, ".data_storage")).resolve()
+    )
+    # Set up the data directory. Cognee will store files here.
+    config.data_root_directory(data_directory_path)
+
+    cognee_directory_path = str(
+        pathlib.Path(os.path.join(pathlib.Path(__file__).parent, ".cognee_system")).resolve()
+    )
+    # Set up the Cognee system directory. Cognee will store system files and databases here.
+    config.system_root_directory(cognee_directory_path)
+
+    # Prune data and system metadata before running, only if we want "fresh" state.
+    await prune.prune_data()
+    await prune.prune_system(metadata=True)
+
+    text = "The Python programming language is widely used in data analysis, web development, and machine learning."
+
+    # Add the text data to Cognee.
+    await add(text)
+
+    # Cognify the text data.
+    await cognify()
+
+    # # Get a graphistry url (Register for a free account at https://www.graphistry.com)
+    # url = await render_graph()
+    # print(f"Graphistry URL: {url}")
+
+    # Or use our simple graph preview
+    graph_file_path = str(
+        pathlib.Path(
+            os.path.join(pathlib.Path(__file__).parent, ".artifacts/graph_visualization.html")
+        ).resolve()
+    )
+    await visualize_graph(graph_file_path)
+
+    # Completion query that uses graph data to form context.
+    graph_completion = await search(
+        query_text="What is python?", query_type=SearchType.GRAPH_COMPLETION
+    )
+    print("Graph completion result is:")
+    print(graph_completion)
+
+    # Completion query that uses document chunks to form context.
+    rag_completion = await search(
+        query_text="What is Python?", query_type=SearchType.RAG_COMPLETION
+    )
+    print("Completion result is:")
+    print(rag_completion)
+
+    # Query all summaries related to the query.
+    summaries = await search(query_text="Python", query_type=SearchType.SUMMARIES)
+    print("Summary results are:")
+    for summary in summaries:
+        print(summary)
+
+    # Query all chunks related to the query.
+    chunks = await search(query_text="Python", query_type=SearchType.CHUNKS)
+    print("Chunk results are:")
+    for chunk in chunks:
+        print(chunk)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/cognee-starter-kit/src/pipelines/low_level.py b/cognee-starter-kit/src/pipelines/low_level.py
new file mode 100644
index 000000000..4c4c9d6da
--- /dev/null
+++ b/cognee-starter-kit/src/pipelines/low_level.py
@@ -0,0 +1,125 @@
+import os
+import uuid
+import json
+import asyncio
+import pathlib
+from cognee import config, prune, search, SearchType, visualize_graph
+from cognee.low_level import setup, DataPoint
+from cognee.pipelines import run_tasks, Task
+from cognee.tasks.storage import add_data_points
+from cognee.tasks.storage.index_graph_edges import index_graph_edges
+from cognee.modules.users.methods import get_default_user
+
+
+class Person(DataPoint):
+    name: str
+    metadata: dict = {"index_fields": ["name"]}
+
+
+class Department(DataPoint):
+    name: str
+    employees: list[Person]
+    metadata: dict = {"index_fields": ["name"]}
+
+
+class CompanyType(DataPoint):
+    name: str = "Company"
+
+
+class Company(DataPoint):
+    name: str
+    departments: list[Department]
+    is_type: CompanyType
+    metadata: dict = {"index_fields": ["name"]}
+
+
+def ingest_files():
+    companies_file_path = os.path.join(os.path.dirname(__file__), "../data/companies.json")
+    with open(companies_file_path, "r") as companies_file:
+        companies = json.load(companies_file)
+
+    people_file_path = os.path.join(os.path.dirname(__file__), "../data/people.json")
+    with open(people_file_path, "r") as people_file:
+        people = json.load(people_file)
+
+    people_data_points = {}
+    departments_data_points = {}
+
+    for person in people:
+        new_person = Person(name=person["name"])
+        people_data_points[person["name"]] = new_person
+
+        if person["department"] not in departments_data_points:
+            departments_data_points[person["department"]] = Department(
+                name=person["department"], employees=[new_person]
+            )
+        else:
+            departments_data_points[person["department"]].employees.append(new_person)
+
+    companies_data_points = {}
+
+    # Create a single CompanyType node, so we can connect all companies to it.
+    company_type = CompanyType()
+
+    for company in companies:
+        new_company = Company(name=company["name"], departments=[], is_type=company_type)
+        companies_data_points[company["name"]] = new_company
+
+        for department_name in company["departments"]:
+            if department_name not in departments_data_points:
+                departments_data_points[department_name] = Department(
+                    name=department_name, employees=[]
+                )
+
+            new_company.departments.append(departments_data_points[department_name])
+
+    return companies_data_points.values()
+
+
+async def main():
+    cognee_directory_path = str(
+        pathlib.Path(os.path.join(pathlib.Path(__file__).parent, ".cognee_system")).resolve()
+    )
+    # Set up the Cognee system directory. Cognee will store system files and databases here.
+    config.system_root_directory(cognee_directory_path)
+
+    # Prune system metadata before running, only if we want "fresh" state.
+    await prune.prune_system(metadata=True)
+
+    await setup()
+
+    # Generate a random dataset_id
+    dataset_id = uuid.uuid4()
+    user = await get_default_user()
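+
+    # run_tasks chains the tasks: ingest_files yields DataPoints, which
+    # add_data_points then stores. The positional arguments after the task list
+    # are the dataset id, the initial input data (None here, since ingest_files
+    # loads its own), the user, and a pipeline name.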
+    pipeline = run_tasks(
+        [
+            Task(ingest_files),
+            Task(add_data_points),
+        ],
+        dataset_id,
+        None,
+        user,
+        "demo_pipeline",
+    )
+
+    async for status in pipeline:
+        print(status)
+
+    await index_graph_edges()
+
+    # Use our simple graph preview
+    graph_file_path = str(
+        os.path.join(os.path.dirname(__file__), ".artifacts/graph_visualization.html")
+    )
+    await visualize_graph(graph_file_path)
+
+    # Completion query that uses graph data to form context.
+    completion = await search(
+        query_text="Who works for GreenFuture Solutions?",
+        query_type=SearchType.GRAPH_COMPLETION,
+    )
+    print("Graph completion result is:")
+    print(completion)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/cognee/tests/test_starter_pipelines.py b/cognee/tests/test_starter_pipelines.py
new file mode 100644
index 000000000..97e9d0881
--- /dev/null
+++ b/cognee/tests/test_starter_pipelines.py
@@ -0,0 +1,66 @@
+import unittest
+import subprocess
+import os
+import sys
+
+
+class TestPipelines(unittest.TestCase):
+    """Tests that all pipelines run successfully."""
+
+    def setUp(self):
+        # Ensure we're in the correct directory
+        self.project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
+        self.pipelines_dir = os.path.join(self.project_root, "src", "pipelines")
+
+        # Required environment variables
+        self.required_env_vars = ["LLM_API_KEY", "EMBEDDING_API_KEY"]
+
+        # Check if required environment variables are set
+        missing_vars = [var for var in self.required_env_vars if not os.environ.get(var)]
+        if missing_vars:
+            self.skipTest(f"Missing required environment variables: {', '.join(missing_vars)}")
+
+    def _run_pipeline(self, script_name):
+        """Helper method to run a pipeline script and return the result."""
+        script_path = os.path.join(self.pipelines_dir, script_name)
+
+        # Use the Python executable from the virtual environment
+        python_exe = os.path.join(self.project_root, ".venv", "bin", "python")
+        if not os.path.exists(python_exe):
+            python_exe = sys.executable
+
+        try:
+            result = subprocess.run(
+                [python_exe, script_path],
+                check=True,
+                capture_output=True,
+                text=True,
+                timeout=300,  # 5 minute timeout
+            )
+            return result
+        except subprocess.CalledProcessError as e:
+            self.fail(
+                f"Pipeline {script_name} failed with code {e.returncode}. "
+                f"Stdout: {e.stdout}, Stderr: {e.stderr}"
+            )
+        except subprocess.TimeoutExpired:
+            self.fail(f"Pipeline {script_name} timed out after 300 seconds")
+
+    def test_default_pipeline(self):
+        """Test that the default pipeline runs successfully."""
+        result = self._run_pipeline("default.py")
+        self.assertEqual(result.returncode, 0)
+
+    def test_low_level_pipeline(self):
+        """Test that the low-level pipeline runs successfully."""
+        result = self._run_pipeline("low_level.py")
+        self.assertEqual(result.returncode, 0)
+
+    def test_custom_model_pipeline(self):
+        """Test that the custom model pipeline runs successfully."""
+        result = self._run_pipeline("custom-model.py")
+        self.assertEqual(result.returncode, 0)
+
+
+if __name__ == "__main__":
+    unittest.main()