
Commit c4cca9a

Merge pull request #943 from ScrapeGraphAI/pre/beta

Pre/beta

2 parents: 4ec2ca6 + 6a101e2

11 files changed: +370 −134 lines

CHANGELOG.md

+12

@@ -1,3 +1,15 @@
+## [1.41.0-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.40.1...v1.41.0-beta.1) (2025-03-07)
+
+
+### Features
+
+* add CLoD integration ([4e0e785](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/4e0e78582c3a75e64c5eba26ce40b5ffbf05d58e))
+
+
+### Test
+
+* Add coverage improvement test for tests/test_generate_answer_node.py ([6769c0d](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/6769c0d43ab72f1c8b520dd28d19f747b22f9b7c))
+
 ## [1.40.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.40.0...v1.40.1) (2025-02-27)

pyproject.toml

+1 −1

@@ -1,7 +1,7 @@
 [project]
 name = "scrapegraphai"
 
-version = "1.40.1"
+version = "1.41.0b1"
 
 

scrapegraphai/graphs/abstract_graph.py

+6 −1

@@ -13,7 +13,7 @@
 from pydantic import BaseModel
 
 from ..helpers import models_tokens
-from ..models import DeepSeek, OneApi
+from ..models import CLoD, DeepSeek, OneApi
 from ..utils.logging import set_verbosity_info, set_verbosity_warning
 
 
@@ -164,6 +164,7 @@ def _create_llm(self, llm_config: dict) -> object:
             "deepseek",
             "ernie",
             "fireworks",
+            "clod",
             "togetherai",
         }
 
@@ -218,6 +219,7 @@ def _create_llm(self, llm_config: dict) -> object:
             "ernie",
             "deepseek",
             "togetherai",
+            "clod",
         }:
             if llm_params["model_provider"] == "bedrock":
                 llm_params["model_kwargs"] = {
@@ -229,6 +231,9 @@ def _create_llm(self, llm_config: dict) -> object:
         else:
             model_provider = llm_params.pop("model_provider")
 
+            if model_provider == "clod":
+                return CLoD(**llm_params)
+
             if model_provider == "deepseek":
                 return DeepSeek(**llm_params)

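For context, the routing added above means a CLoD-backed graph is selected the same way as the other providers: a "clod/<model>" string in the llm config, which _create_llm splits into model_provider == "clod" and hands to the CLoD wrapper. A minimal usage sketch (illustrative, not part of this diff; the model string and source URL mirror the test added later in this commit, and the API key is a placeholder):

# Sketch only: configuring a graph to use the new "clod" provider.
from scrapegraphai.graphs import SmartScraperGraph

graph_config = {
    "llm": {
        "api_key": "your-clod-api-key",            # placeholder
        "model": "clod/claude-3-5-sonnet-latest",  # "clod/" prefix selects the CLoD wrapper
    },
}

graph = SmartScraperGraph(
    prompt="List me all the projects with their description.",
    source="https://perinim.github.io/projects/",
    config=graph_config,
)
result = graph.run()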
scrapegraphai/helpers/models_tokens.py

+32

@@ -261,5 +261,37 @@
         "mixtral-moe-8x22B-instruct": 65536,
         "mixtral-moe-8x7B-instruct": 65536,
     },
+    "clod": {
+        "open-mistral-7b": 32000,
+        "Llama-3.1-70b": 128000,
+        "Llama-3.1-405b": 128000,
+        "Llama-3.3-70b": 128000,
+        "Llama-3.1-8b": 128000,
+        "gpt-4o": 128000,
+        "gpt-4o-mini": 128000,
+        "gpt-4-turbo": 128000,
+        "claude-3-opus-latest": 200000,
+        "gemini-1.5-flash-8b": 128000,
+        "gemini-1.5-flash": 128000,
+        "open-mixtral-8x7b": 32000,
+        "open-mixtral-8x22b": 64000,
+        "claude-3-5-sonnet-latest": 200000,
+        "claude-3-haiku-20240307": 200000,
+        "Qwen-2.5-Coder-32B": 32000,
+        "Deepseek-R1-Distill-Llama-70B": 131072,
+        "Deepseek-V3": 128000,
+        "Qwen-2-VL-72B": 128000,
+        "Deepseek-R1-Distill-Qwen-14B": 131072,
+        "Deepseek-R1-Distill-Qwen-1.5B": 131072,
+        "Deepseek-R1": 128000,
+        "Deepseek-Llm-Chat-67B": 4096,
+        "Qwen-2.5-7B": 132072,
+        "Qwen-2.5-72B": 132072,
+        "Qwen-2-72B": 128000,
+        "o1": 200000,
+        "gemini-2.0-flash-exp": 1000000,
+        "grok-beta": 128000,
+        "grok-2-latest": 128000,
+    },
     "togetherai": {"Meta-Llama-3.1-70B-Instruct-Turbo": 128000},
 }

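The new "clod" block follows the same shape as the other providers: plain integer context-window sizes keyed by model name. A small lookup sketch (illustrative, not part of the diff):

# Sketch only: reading the CLoD context-window limits registered above.
from scrapegraphai.helpers.models_tokens import models_tokens

clod_limits = models_tokens["clod"]
assert clod_limits["claude-3-5-sonnet-latest"] == 200000
assert clod_limits["gemini-2.0-flash-exp"] == 1000000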
scrapegraphai/models/__init__.py

+2 −6

@@ -2,14 +2,10 @@
 This module contains the model definitions used in the ScrapeGraphAI application.
 """
 
+from .clod import CLoD
 from .deepseek import DeepSeek
 from .oneapi import OneApi
 from .openai_itt import OpenAIImageToText
 from .openai_tts import OpenAITextToSpeech
 
-__all__ = [
-    "DeepSeek",
-    "OneApi",
-    "OpenAIImageToText",
-    "OpenAITextToSpeech",
-]
+__all__ = ["DeepSeek", "OneApi", "OpenAIImageToText", "OpenAITextToSpeech", "CLoD"]

scrapegraphai/models/clod.py

+23

@@ -0,0 +1,23 @@
+"""
+CLōD Module
+"""
+
+from langchain_openai import ChatOpenAI
+
+
+class CLoD(ChatOpenAI):
+    """
+    A wrapper for the ChatOpenAI class (CLōD uses an OpenAI-like API) that
+    provides default configuration and could be extended with additional methods
+    if needed.
+
+    Args:
+        llm_config (dict): Configuration parameters for the language model.
+    """
+
+    def __init__(self, **llm_config):
+        if "api_key" in llm_config:
+            llm_config["openai_api_key"] = llm_config.pop("api_key")
+        llm_config["openai_api_base"] = "https://api.clod.io/v1"
+
+        super().__init__(**llm_config)

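The wrapper only renames api_key to openai_api_key and pins openai_api_base to https://api.clod.io/v1 before delegating to ChatOpenAI, so it can also be instantiated directly. A brief sketch (illustrative; the model name is taken from the token table above and the key is a placeholder):

# Sketch only: using the wrapper outside a graph.
from scrapegraphai.models import CLoD

llm = CLoD(api_key="your-clod-api-key", model="claude-3-5-sonnet-latest")
# Any other keyword arguments are passed straight through to ChatOpenAI,
# and requests go to the CLoD endpoint configured in __init__.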
tests/graphs/.env.example

+1

@@ -1,2 +1,3 @@
 OPENAI_API_KEY="YOUR OPENAI API KEY"
 FIREWORKS_APIKEY="YOOUR FIREWORK KEY"
+CLOD_API_KEY="YOUR CLOD API KEY"

tests/graphs/abstract_graph_test.py

+34 −1

@@ -199,4 +199,37 @@ def test_set_common_params(self):
         test_params = {"param1": "value1", "param2": "value2"}
         graph.set_common_params(test_params)
 
-        # Assert that update_config was called on each node with the correct parameters
+        # Assert that update_config was called on each node with the correct parameters
+
+    def test_get_state(self):
+        """Test that get_state returns the correct final state with or without a provided key, and raises KeyError for missing keys."""
+        graph = TestGraph("dummy", {"llm": {"model": "openai/gpt-3.5-turbo", "openai_api_key": "sk-test"}})
+        # Set a dummy final state
+        graph.final_state = {"answer": "42", "other": "value"}
+        # Test without a key returns the entire final_state
+        state = graph.get_state()
+        assert state == {"answer": "42", "other": "value"}
+        # Test with a valid key returns the specific value
+        answer = graph.get_state("answer")
+        assert answer == "42"
+        # Test that a missing key raises a KeyError
+        with pytest.raises(KeyError):
+            _ = graph.get_state("nonexistent")
+
+    def test_append_node(self):
+        """Test that append_node correctly delegates to the graph's append_node method."""
+        graph = TestGraph("dummy", {"llm": {"model": "openai/gpt-3.5-turbo", "openai_api_key": "sk-test"}})
+        # Replace the graph object with a mock that has append_node
+        mock_graph = Mock()
+        graph.graph = mock_graph
+        dummy_node = Mock()
+        graph.append_node(dummy_node)
+        mock_graph.append_node.assert_called_once_with(dummy_node)
+
+    def test_get_execution_info(self):
+        """Test that get_execution_info returns the execution info stored in the graph."""
+        graph = TestGraph("dummy", {"llm": {"model": "openai/gpt-3.5-turbo", "openai_api_key": "sk-test"}})
+        dummy_info = {"execution": "info", "status": "ok"}
+        graph.execution_info = dummy_info
+        info = graph.get_execution_info()
+        assert info == dummy_info
+55

@@ -0,0 +1,55 @@
+"""
+Module for testing the smart scraper class
+"""
+
+import os
+
+import pytest
+from dotenv import load_dotenv
+
+from scrapegraphai.graphs import SmartScraperGraph
+
+load_dotenv()
+
+
+@pytest.fixture
+def graph_config():
+    """Configuration of the graph"""
+    clod_api_key = os.getenv("CLOD_API_KEY")
+    return {
+        "llm": {
+            "api_key": clod_api_key,
+            "model": "clod/claude-3-5-sonnet-latest",
+        },
+        "verbose": True,
+        "headless": False,
+    }
+
+
+def test_scraping_pipeline(graph_config):
+    """Start of the scraping pipeline"""
+    smart_scraper_graph = SmartScraperGraph(
+        prompt="List me all the projects with their description.",
+        source="https://perinim.github.io/projects/",
+        config=graph_config,
+    )
+
+    result = smart_scraper_graph.run()
+
+    assert result is not None
+    assert isinstance(result, dict)
+
+
+def test_get_execution_info(graph_config):
+    """Get the execution info"""
+    smart_scraper_graph = SmartScraperGraph(
+        prompt="List me all the projects with their description.",
+        source="https://perinim.github.io/projects/",
+        config=graph_config,
+    )
+
+    smart_scraper_graph.run()
+
+    graph_exec_info = smart_scraper_graph.get_execution_info()
+
+    assert graph_exec_info is not None

tests/test_models_tokens.py

+148

@@ -0,0 +1,148 @@
+import pytest
+from scrapegraphai.helpers.models_tokens import models_tokens
+
+class TestModelsTokens:
+    """Test suite for verifying the models_tokens dictionary content and structure."""
+
+    def test_openai_tokens(self):
+        """Test that the 'openai' provider exists and its tokens are valid positive integers."""
+        openai_models = models_tokens.get("openai")
+        assert openai_models is not None, "'openai' key should be present in models_tokens"
+        for model, token in openai_models.items():
+            assert isinstance(model, str), "Model name should be a string"
+            assert isinstance(token, int), "Token limit should be an integer"
+            assert token > 0, "Token limit should be positive"
+
+    def test_azure_openai_tokens(self):
+        """Test that the 'azure_openai' provider exists and its tokens are valid."""
+        azure_models = models_tokens.get("azure_openai")
+        assert azure_models is not None, "'azure_openai' key should be present"
+        for model, token in azure_models.items():
+            assert isinstance(model, str), "Model name should be a string"
+            assert isinstance(token, int), "Token limit should be an integer"
+
+    def test_google_providers(self):
+        """Test that Google provider dictionaries ('google_genai' and 'google_vertexai') contain expected entries."""
+        google_genai = models_tokens.get("google_genai")
+        google_vertexai = models_tokens.get("google_vertexai")
+        assert google_genai is not None, "'google_genai' key should be present"
+        assert google_vertexai is not None, "'google_vertexai' key should be present"
+        # Check a specific key from google_genai
+        assert "gemini-pro" in google_genai, "'gemini-pro' should be in google_genai models"
+        # Validate token values types
+        for provider in [google_genai, google_vertexai]:
+            for token in provider.values():
+                assert isinstance(token, int), "Token limit must be an integer"
+
+    def test_non_existent_provider(self):
+        """Test that a non-existent provider returns None."""
+        assert models_tokens.get("non_existent") is None, "Non-existent provider should return None"
+
+    def test_total_model_keys(self):
+        """Test that the total number of models across all providers is above an expected count."""
+        total_keys = sum(len(details) for details in models_tokens.values())
+        assert total_keys > 20, "Expected more than 20 total model tokens defined"
+
+    def test_specific_token_value(self):
+        """Test specific expected token value for a known model."""
+        openai = models_tokens.get("openai")
+        # Verify that the token limit for "gpt-4" is 8192 as defined
+        assert openai.get("gpt-4") == 8192, "Expected token limit for gpt-4 to be 8192"
+
+    def test_non_empty_model_keys(self):
+        """Ensure that model token names are non-empty strings."""
+        for provider, model_dict in models_tokens.items():
+            for model in model_dict.keys():
+                assert model != "", f"Model name in provider '{provider}' should not be empty."
+
+    def test_token_limits_range(self):
+        """Test that token limits for all models fall within a plausible range (1 to 1,100,000)."""
+        for provider, model_dict in models_tokens.items():
+            for model, token in model_dict.items():
+                assert 1 <= token <= 1100000, f"Token limit for {model} in provider {provider} is out of plausible range."
+    def test_provider_structure(self):
+        """Test that every provider in models_tokens has a dictionary as its value."""
+        for provider, models in models_tokens.items():
+            assert isinstance(models, dict), f"Provider {provider} should map to a dictionary, got {type(models).__name__}"
+
+    def test_non_empty_provider(self):
+        """Test that each provider dictionary is not empty."""
+        for provider, models in models_tokens.items():
+            assert len(models) > 0, f"Provider {provider} should contain at least one model."
+
+    def test_specific_model_token_values(self):
+        """Test specific expected token values for selected models from various providers."""
+        # Verify a token for a selected model from the 'openai' provider
+        openai = models_tokens.get("openai")
+        assert openai.get("gpt-3.5-turbo-0125") == 16385, "Expected token limit for gpt-3.5-turbo-0125 in openai to be 16385"
+
+        # Verify a token for a selected model from the 'azure_openai' provider
+        azure = models_tokens.get("azure_openai")
+        assert azure.get("gpt-3.5") == 4096, "Expected token limit for gpt-3.5 in azure_openai to be 4096"
+
+        # Verify a token for a selected model from the 'anthropic' provider
+        anthropic = models_tokens.get("anthropic")
+        assert anthropic.get("claude_instant") == 100000, "Expected token limit for claude_instant in anthropic to be 100000"
+
+    def test_providers_count(self):
+        """Test that the total number of providers is as expected (at least 15)."""
+        assert len(models_tokens) >= 15, "Expected at least 15 providers in models_tokens"
+
+    def test_non_existent_model(self):
+        """Test that a non-existent model within a valid provider returns None."""
+        openai = models_tokens.get("openai")
+        assert openai.get("non_existent_model") is None, "Non-existent model should return None from a valid provider."
+    def test_no_whitespace_in_model_names(self):
+        """Test that model names do not contain leading or trailing whitespace."""
+        for provider, model_dict in models_tokens.items():
+            for model in model_dict.keys():
+                # Assert that stripping whitespace does not change the model name
+                assert model == model.strip(), f"Model name '{model}' in provider '{provider}' contains leading or trailing whitespace."
+
+    def test_specific_models_additional(self):
+        """Test specific token values for additional models across various providers."""
+        # Check some models in the 'ollama' provider
+        ollama = models_tokens.get("ollama")
+        assert ollama.get("llama2") == 4096, "Expected token limit for 'llama2' in ollama to be 4096"
+        assert ollama.get("llama2:70b") == 4096, "Expected token limit for 'llama2:70b' in ollama to be 4096"
+
+        # Check a specific model from the 'mistralai' provider
+        mistralai = models_tokens.get("mistralai")
+        assert mistralai.get("open-codestral-mamba") == 256000, "Expected token limit for 'open-codestral-mamba' in mistralai to be 256000"
+
+        # Check a specific model from the 'deepseek' provider
+        deepseek = models_tokens.get("deepseek")
+        assert deepseek.get("deepseek-chat") == 28672, "Expected token limit for 'deepseek-chat' in deepseek to be 28672"
+
+        # Check a model from the 'ernie' provider
+        ernie = models_tokens.get("ernie")
+        assert ernie.get("ernie-bot") == 4096, "Expected token limit for 'ernie-bot' in ernie to be 4096"
+
+    def test_nvidia_specific(self):
+        """Test specific token value for 'meta/codellama-70b' in the nvidia provider."""
+        nvidia = models_tokens.get("nvidia")
+        assert nvidia is not None, "'nvidia' provider should exist"
+        # Verify token for 'meta/codellama-70b' equals 16384 as defined in the nvidia dictionary
+        assert nvidia.get("meta/codellama-70b") == 16384, "Expected token limit for 'meta/codellama-70b' in nvidia to be 16384"
+
+    def test_groq_specific(self):
+        """Test specific token value for 'claude-3-haiku-20240307\'' in the groq provider."""
+        groq = models_tokens.get("groq")
+        assert groq is not None, "'groq' provider should exist"
+        # Note: The model name has an embedded apostrophe at the end in its name.
+        assert groq.get("claude-3-haiku-20240307'") == 8192, "Expected token limit for 'claude-3-haiku-20240307\\'' in groq to be 8192"
+
+    def test_togetherai_specific(self):
+        """Test specific token value for 'meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo' in the toghetherai provider."""
+        togetherai = models_tokens.get("toghetherai")
+        assert togetherai is not None, "'toghetherai' provider should exist"
+        expected = 128000
+        model_name = "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo"
+        assert togetherai.get(model_name) == expected, f"Expected token limit for '{model_name}' in toghetherai to be {expected}"
+
+    def test_ernie_all_values(self):
+        """Test that all models in the 'ernie' provider have token values exactly 4096."""
+        ernie = models_tokens.get("ernie")
+        assert ernie is not None, "'ernie' provider should exist"
+        for model, token in ernie.items():
+            assert token == 4096, f"Expected token limit for '{model}' in ernie to be 4096, got {token}"
