
Commit c4cca9a

Merge pull request #943 from ScrapeGraphAI/pre/beta

Pre/beta

2 parents: 4ec2ca6 + 6a101e2

11 files changed: +370 −134 lines

CHANGELOG.md

+12

@@ -1,3 +1,15 @@
+## [1.41.0-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.40.1...v1.41.0-beta.1) (2025-03-07)
+
+
+### Features
+
+* add CLoD integration ([4e0e785](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/4e0e78582c3a75e64c5eba26ce40b5ffbf05d58e))
+
+
+### Test
+
+* Add coverage improvement test for tests/test_generate_answer_node.py ([6769c0d](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/6769c0d43ab72f1c8b520dd28d19f747b22f9b7c))
+
 ## [1.40.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.40.0...v1.40.1) (2025-02-27)

pyproject.toml

+1 −1

@@ -1,7 +1,7 @@
 [project]
 name = "scrapegraphai"
 
-version = "1.40.1"
+version = "1.41.0b1"
 
 

scrapegraphai/graphs/abstract_graph.py

+6 −1

@@ -13,7 +13,7 @@
 from pydantic import BaseModel
 
 from ..helpers import models_tokens
-from ..models import DeepSeek, OneApi
+from ..models import CLoD, DeepSeek, OneApi
 from ..utils.logging import set_verbosity_info, set_verbosity_warning
 
 
@@ -164,6 +164,7 @@ def _create_llm(self, llm_config: dict) -> object:
             "deepseek",
             "ernie",
             "fireworks",
+            "clod",
             "togetherai",
         }
 
@@ -218,6 +219,7 @@ def _create_llm(self, llm_config: dict) -> object:
             "ernie",
             "deepseek",
             "togetherai",
+            "clod",
         }:
             if llm_params["model_provider"] == "bedrock":
                 llm_params["model_kwargs"] = {
@@ -229,6 +231,9 @@ def _create_llm(self, llm_config: dict) -> object:
         else:
             model_provider = llm_params.pop("model_provider")
 
+            if model_provider == "clod":
+                return CLoD(**llm_params)
+
             if model_provider == "deepseek":
                 return DeepSeek(**llm_params)

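For context, the routing added above means a CLoD-backed graph is selected the same way as the other providers: a "clod/<model>" string in the llm config, which _create_llm splits into model_provider == "clod" and hands to the CLoD wrapper. A minimal usage sketch (illustrative, not part of this diff; the model string and source URL mirror the test added later in this commit, and the API key is a placeholder):

# Sketch only: configuring a graph to use the new "clod" provider.
from scrapegraphai.graphs import SmartScraperGraph

graph_config = {
    "llm": {
        "api_key": "your-clod-api-key",            # placeholder
        "model": "clod/claude-3-5-sonnet-latest",  # "clod/" prefix selects the CLoD wrapper
    },
}

graph = SmartScraperGraph(
    prompt="List me all the projects with their description.",
    source="https://perinim.github.io/projects/",
    config=graph_config,
)
result = graph.run()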
scrapegraphai/helpers/models_tokens.py

+32

@@ -261,5 +261,37 @@
         "mixtral-moe-8x22B-instruct": 65536,
         "mixtral-moe-8x7B-instruct": 65536,
     },
+    "clod": {
+        "open-mistral-7b": 32000,
+        "Llama-3.1-70b": 128000,
+        "Llama-3.1-405b": 128000,
+        "Llama-3.3-70b": 128000,
+        "Llama-3.1-8b": 128000,
+        "gpt-4o": 128000,
+        "gpt-4o-mini": 128000,
+        "gpt-4-turbo": 128000,
+        "claude-3-opus-latest": 200000,
+        "gemini-1.5-flash-8b": 128000,
+        "gemini-1.5-flash": 128000,
+        "open-mixtral-8x7b": 32000,
+        "open-mixtral-8x22b": 64000,
+        "claude-3-5-sonnet-latest": 200000,
+        "claude-3-haiku-20240307": 200000,
+        "Qwen-2.5-Coder-32B": 32000,
+        "Deepseek-R1-Distill-Llama-70B": 131072,
+        "Deepseek-V3": 128000,
+        "Qwen-2-VL-72B": 128000,
+        "Deepseek-R1-Distill-Qwen-14B": 131072,
+        "Deepseek-R1-Distill-Qwen-1.5B": 131072,
+        "Deepseek-R1": 128000,
+        "Deepseek-Llm-Chat-67B": 4096,
+        "Qwen-2.5-7B": 132072,
+        "Qwen-2.5-72B": 132072,
+        "Qwen-2-72B": 128000,
+        "o1": 200000,
+        "gemini-2.0-flash-exp": 1000000,
+        "grok-beta": 128000,
+        "grok-2-latest": 128000,
+    },
     "togetherai": {"Meta-Llama-3.1-70B-Instruct-Turbo": 128000},
 }

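The new "clod" block follows the same shape as the other providers: plain integer context-window sizes keyed by model name. A small lookup sketch (illustrative, not part of the diff):

# Sketch only: reading the CLoD context-window limits registered above.
from scrapegraphai.helpers.models_tokens import models_tokens

clod_limits = models_tokens["clod"]
assert clod_limits["claude-3-5-sonnet-latest"] == 200000
assert clod_limits["gemini-2.0-flash-exp"] == 1000000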
scrapegraphai/models/__init__.py

+2 −6

@@ -2,14 +2,10 @@
 This module contains the model definitions used in the ScrapeGraphAI application.
 """
 
+from .clod import CLoD
 from .deepseek import DeepSeek
 from .oneapi import OneApi
 from .openai_itt import OpenAIImageToText
 from .openai_tts import OpenAITextToSpeech
 
-__all__ = [
-    "DeepSeek",
-    "OneApi",
-    "OpenAIImageToText",
-    "OpenAITextToSpeech",
-]
+__all__ = ["DeepSeek", "OneApi", "OpenAIImageToText", "OpenAITextToSpeech", "CLoD"]

scrapegraphai/models/clod.py

+23

@@ -0,0 +1,23 @@
+"""
+CLōD Module
+"""
+
+from langchain_openai import ChatOpenAI
+
+
+class CLoD(ChatOpenAI):
+    """
+    A wrapper for the ChatOpenAI class (CLōD uses an OpenAI-like API) that
+    provides default configuration and could be extended with additional methods
+    if needed.
+
+    Args:
+        llm_config (dict): Configuration parameters for the language model.
+    """
+
+    def __init__(self, **llm_config):
+        if "api_key" in llm_config:
+            llm_config["openai_api_key"] = llm_config.pop("api_key")
+        llm_config["openai_api_base"] = "https://api.clod.io/v1"
+
+        super().__init__(**llm_config)

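The wrapper only renames api_key to openai_api_key and pins openai_api_base to https://api.clod.io/v1 before delegating to ChatOpenAI, so it can also be instantiated directly. A brief sketch (illustrative; the model name is taken from the token table above and the key is a placeholder):

# Sketch only: using the wrapper outside a graph.
from scrapegraphai.models import CLoD

llm = CLoD(api_key="your-clod-api-key", model="claude-3-5-sonnet-latest")
# Any other keyword arguments are passed straight through to ChatOpenAI,
# and requests go to the CLoD endpoint configured in __init__.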
tests/graphs/.env.example

+1

@@ -1,2 +1,3 @@
 OPENAI_API_KEY="YOUR OPENAI API KEY"
 FIREWORKS_APIKEY="YOOUR FIREWORK KEY"
+CLOD_API_KEY="YOUR CLOD API KEY"

tests/graphs/abstract_graph_test.py

+34 −1

@@ -199,4 +199,37 @@ def test_set_common_params(self):
         test_params = {"param1": "value1", "param2": "value2"}
         graph.set_common_params(test_params)
 
-        # Assert that update_config was called on each node with the correct parameters
+        # Assert that update_config was called on each node with the correct parameters
+
+    def test_get_state(self):
+        """Test that get_state returns the correct final state with or without a provided key, and raises KeyError for missing keys."""
+        graph = TestGraph("dummy", {"llm": {"model": "openai/gpt-3.5-turbo", "openai_api_key": "sk-test"}})
+        # Set a dummy final state
+        graph.final_state = {"answer": "42", "other": "value"}
+        # Test without a key returns the entire final_state
+        state = graph.get_state()
+        assert state == {"answer": "42", "other": "value"}
+        # Test with a valid key returns the specific value
+        answer = graph.get_state("answer")
+        assert answer == "42"
+        # Test that a missing key raises a KeyError
+        with pytest.raises(KeyError):
+            _ = graph.get_state("nonexistent")
+
+    def test_append_node(self):
+        """Test that append_node correctly delegates to the graph's append_node method."""
+        graph = TestGraph("dummy", {"llm": {"model": "openai/gpt-3.5-turbo", "openai_api_key": "sk-test"}})
+        # Replace the graph object with a mock that has append_node
+        mock_graph = Mock()
+        graph.graph = mock_graph
+        dummy_node = Mock()
+        graph.append_node(dummy_node)
+        mock_graph.append_node.assert_called_once_with(dummy_node)
+
+    def test_get_execution_info(self):
+        """Test that get_execution_info returns the execution info stored in the graph."""
+        graph = TestGraph("dummy", {"llm": {"model": "openai/gpt-3.5-turbo", "openai_api_key": "sk-test"}})
+        dummy_info = {"execution": "info", "status": "ok"}
+        graph.execution_info = dummy_info
+        info = graph.get_execution_info()
+        assert info == dummy_info
+55

@@ -0,0 +1,55 @@
+"""
+Module for testing the smart scraper class
+"""
+
+import os
+
+import pytest
+from dotenv import load_dotenv
+
+from scrapegraphai.graphs import SmartScraperGraph
+
+load_dotenv()
+
+
+@pytest.fixture
+def graph_config():
+    """Configuration of the graph"""
+    clod_api_key = os.getenv("CLOD_API_KEY")
+    return {
+        "llm": {
+            "api_key": clod_api_key,
+            "model": "clod/claude-3-5-sonnet-latest",
+        },
+        "verbose": True,
+        "headless": False,
+    }
+
+
+def test_scraping_pipeline(graph_config):
+    """Start of the scraping pipeline"""
+    smart_scraper_graph = SmartScraperGraph(
+        prompt="List me all the projects with their description.",
+        source="https://perinim.github.io/projects/",
+        config=graph_config,
+    )
+
+    result = smart_scraper_graph.run()
+
+    assert result is not None
+    assert isinstance(result, dict)
+
+
+def test_get_execution_info(graph_config):
+    """Get the execution info"""
+    smart_scraper_graph = SmartScraperGraph(
+        prompt="List me all the projects with their description.",
+        source="https://perinim.github.io/projects/",
+        config=graph_config,
+    )
+
+    smart_scraper_graph.run()
+
+    graph_exec_info = smart_scraper_graph.get_execution_info()
+
+    assert graph_exec_info is not None

tests/test_models_tokens.py

+148

@@ -0,0 +1,148 @@
+import pytest
+from scrapegraphai.helpers.models_tokens import models_tokens
+
+class TestModelsTokens:
+    """Test suite for verifying the models_tokens dictionary content and structure."""
+
+    def test_openai_tokens(self):
+        """Test that the 'openai' provider exists and its tokens are valid positive integers."""
+        openai_models = models_tokens.get("openai")
+        assert openai_models is not None, "'openai' key should be present in models_tokens"
+        for model, token in openai_models.items():
+            assert isinstance(model, str), "Model name should be a string"
+            assert isinstance(token, int), "Token limit should be an integer"
+            assert token > 0, "Token limit should be positive"
+
+    def test_azure_openai_tokens(self):
+        """Test that the 'azure_openai' provider exists and its tokens are valid."""
+        azure_models = models_tokens.get("azure_openai")
+        assert azure_models is not None, "'azure_openai' key should be present"
+        for model, token in azure_models.items():
+            assert isinstance(model, str), "Model name should be a string"
+            assert isinstance(token, int), "Token limit should be an integer"
+
+    def test_google_providers(self):
+        """Test that Google provider dictionaries ('google_genai' and 'google_vertexai') contain expected entries."""
+        google_genai = models_tokens.get("google_genai")
+        google_vertexai = models_tokens.get("google_vertexai")
+        assert google_genai is not None, "'google_genai' key should be present"
+        assert google_vertexai is not None, "'google_vertexai' key should be present"
+        # Check a specific key from google_genai
+        assert "gemini-pro" in google_genai, "'gemini-pro' should be in google_genai models"
+        # Validate token values types
+        for provider in [google_genai, google_vertexai]:
+            for token in provider.values():
+                assert isinstance(token, int), "Token limit must be an integer"
+
+    def test_non_existent_provider(self):
+        """Test that a non-existent provider returns None."""
+        assert models_tokens.get("non_existent") is None, "Non-existent provider should return None"
+
+    def test_total_model_keys(self):
+        """Test that the total number of models across all providers is above an expected count."""
+        total_keys = sum(len(details) for details in models_tokens.values())
+        assert total_keys > 20, "Expected more than 20 total model tokens defined"
+
+    def test_specific_token_value(self):
+        """Test specific expected token value for a known model."""
+        openai = models_tokens.get("openai")
+        # Verify that the token limit for "gpt-4" is 8192 as defined
+        assert openai.get("gpt-4") == 8192, "Expected token limit for gpt-4 to be 8192"
+
+    def test_non_empty_model_keys(self):
+        """Ensure that model token names are non-empty strings."""
+        for provider, model_dict in models_tokens.items():
+            for model in model_dict.keys():
+                assert model != "", f"Model name in provider '{provider}' should not be empty."
+
+    def test_token_limits_range(self):
+        """Test that token limits for all models fall within a plausible range (1 to 1,100,000)."""
+        for provider, model_dict in models_tokens.items():
+            for model, token in model_dict.items():
+                assert 1 <= token <= 1100000, f"Token limit for {model} in provider {provider} is out of plausible range."
+    def test_provider_structure(self):
+        """Test that every provider in models_tokens has a dictionary as its value."""
+        for provider, models in models_tokens.items():
+            assert isinstance(models, dict), f"Provider {provider} should map to a dictionary, got {type(models).__name__}"
+
+    def test_non_empty_provider(self):
+        """Test that each provider dictionary is not empty."""
+        for provider, models in models_tokens.items():
+            assert len(models) > 0, f"Provider {provider} should contain at least one model."
+
+    def test_specific_model_token_values(self):
+        """Test specific expected token values for selected models from various providers."""
+        # Verify a token for a selected model from the 'openai' provider
+        openai = models_tokens.get("openai")
+        assert openai.get("gpt-3.5-turbo-0125") == 16385, "Expected token limit for gpt-3.5-turbo-0125 in openai to be 16385"
+
+        # Verify a token for a selected model from the 'azure_openai' provider
+        azure = models_tokens.get("azure_openai")
+        assert azure.get("gpt-3.5") == 4096, "Expected token limit for gpt-3.5 in azure_openai to be 4096"
+
+        # Verify a token for a selected model from the 'anthropic' provider
+        anthropic = models_tokens.get("anthropic")
+        assert anthropic.get("claude_instant") == 100000, "Expected token limit for claude_instant in anthropic to be 100000"
+
+    def test_providers_count(self):
+        """Test that the total number of providers is as expected (at least 15)."""
+        assert len(models_tokens) >= 15, "Expected at least 15 providers in models_tokens"
+
+    def test_non_existent_model(self):
+        """Test that a non-existent model within a valid provider returns None."""
+        openai = models_tokens.get("openai")
+        assert openai.get("non_existent_model") is None, "Non-existent model should return None from a valid provider."
+    def test_no_whitespace_in_model_names(self):
+        """Test that model names do not contain leading or trailing whitespace."""
+        for provider, model_dict in models_tokens.items():
+            for model in model_dict.keys():
+                # Assert that stripping whitespace does not change the model name
+                assert model == model.strip(), f"Model name '{model}' in provider '{provider}' contains leading or trailing whitespace."
+
+    def test_specific_models_additional(self):
+        """Test specific token values for additional models across various providers."""
+        # Check some models in the 'ollama' provider
+        ollama = models_tokens.get("ollama")
+        assert ollama.get("llama2") == 4096, "Expected token limit for 'llama2' in ollama to be 4096"
+        assert ollama.get("llama2:70b") == 4096, "Expected token limit for 'llama2:70b' in ollama to be 4096"
+
+        # Check a specific model from the 'mistralai' provider
+        mistralai = models_tokens.get("mistralai")
+        assert mistralai.get("open-codestral-mamba") == 256000, "Expected token limit for 'open-codestral-mamba' in mistralai to be 256000"
+
+        # Check a specific model from the 'deepseek' provider
+        deepseek = models_tokens.get("deepseek")
+        assert deepseek.get("deepseek-chat") == 28672, "Expected token limit for 'deepseek-chat' in deepseek to be 28672"
+
+        # Check a model from the 'ernie' provider
+        ernie = models_tokens.get("ernie")
+        assert ernie.get("ernie-bot") == 4096, "Expected token limit for 'ernie-bot' in ernie to be 4096"
+
+    def test_nvidia_specific(self):
+        """Test specific token value for 'meta/codellama-70b' in the nvidia provider."""
+        nvidia = models_tokens.get("nvidia")
+        assert nvidia is not None, "'nvidia' provider should exist"
+        # Verify token for 'meta/codellama-70b' equals 16384 as defined in the nvidia dictionary
+        assert nvidia.get("meta/codellama-70b") == 16384, "Expected token limit for 'meta/codellama-70b' in nvidia to be 16384"
+
+    def test_groq_specific(self):
+        """Test specific token value for 'claude-3-haiku-20240307\'' in the groq provider."""
+        groq = models_tokens.get("groq")
+        assert groq is not None, "'groq' provider should exist"
+        # Note: The model name has an embedded apostrophe at the end in its name.
+        assert groq.get("claude-3-haiku-20240307'") == 8192, "Expected token limit for 'claude-3-haiku-20240307\\'' in groq to be 8192"
+
+    def test_togetherai_specific(self):
+        """Test specific token value for 'meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo' in the toghetherai provider."""
+        togetherai = models_tokens.get("toghetherai")
+        assert togetherai is not None, "'toghetherai' provider should exist"
+        expected = 128000
+        model_name = "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo"
+        assert togetherai.get(model_name) == expected, f"Expected token limit for '{model_name}' in toghetherai to be {expected}"
+
+    def test_ernie_all_values(self):
+        """Test that all models in the 'ernie' provider have token values exactly 4096."""
+        ernie = models_tokens.get("ernie")
+        assert ernie is not None, "'ernie' provider should exist"
+        for model, token in ernie.items():
+            assert token == 4096, f"Expected token limit for '{model}' in ernie to be 4096, got {token}"
