Merge pull request #925 from ScrapeGraphAI/pre/beta

VinciGit00 · web-flow · commit cc024e1d8896 · 2025-02-17T16:33:33.000+01:00
Pre/beta
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,19 +1,10 @@
-## [1.38.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.38.0...v1.38.1) (2025-02-15)
+## [1.39.0-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.38.1-beta.1...v1.39.0-beta.1) (2025-02-17)
 
 
-### Bug Fixes
-
-* filter links ([04b9197](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/04b91972e88b69b722454d54c8635dfb49b38b44))
-
-
-### Test
-
-* Add coverage improvement test for tests/test_scrape_do.py ([4ce6d1b](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/4ce6d1b94306d0ae94a74748726468a5132b7969))
-
+### Features
 
-### CI
+* add new exception handling ([5c0bc46](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/5c0bc46c6322ea07efa31d95819d7da47462f981))
 
-* **release:** 1.38.1-beta.1 [skip ci] ([83be82a](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/83be82a11e83eb2be60a945deac361c46526c785))
 
 ## [1.38.1-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.38.0...v1.38.1-beta.1) (2025-02-13)
 
diff --git a/docs/assets/api-banner.png b/docs/assets/api-banner.png
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,7 +1,7 @@
 [project]
 name = "scrapegraphai"
 
-version = "1.38.1"
+version = "1.39.0b1"
 
 
 description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."
diff --git a/scrapegraphai/helpers/models_tokens.py b/scrapegraphai/helpers/models_tokens.py
@@ -80,6 +80,7 @@
         "llama3.2": 128000,
         "llama3.2:1b": 128000,
         "llama3.2:3b": 128000,
+        "llama3.3": 128000,
         "llama3.3:70b": 128000,
         "scrapegraph": 8192,
         "mistral-small": 128000,
diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py
@@ -3,6 +3,7 @@
 """
 
 import time
+import json
 from typing import List, Optional
 
 from langchain.prompts import PromptTemplate
@@ -120,7 +121,11 @@ def execute(self, state: dict) -> dict:
         else:
             if not isinstance(self.llm_model, ChatBedrock):
                 output_parser = JsonOutputParser()
-                format_instructions = output_parser.get_format_instructions()
+                format_instructions = (
+                    "You must respond with a JSON object. Your response should be formatted as a valid JSON "
+                    "with a 'content' field containing your analysis. For example:\n"
+                    '{"content": "your analysis here"}'
+                )
             else:
                 output_parser = None
                 format_instructions = ""
@@ -131,13 +136,25 @@ def execute(self, state: dict) -> dict:
             and not self.script_creator
             or self.is_md_scraper
         ):
-            template_no_chunks_prompt = TEMPLATE_NO_CHUNKS_MD
-            template_chunks_prompt = TEMPLATE_CHUNKS_MD
-            template_merge_prompt = TEMPLATE_MERGE_MD
+            template_no_chunks_prompt = (
+                TEMPLATE_NO_CHUNKS_MD + "\n\nIMPORTANT: " + format_instructions
+            )
+            template_chunks_prompt = (
+                TEMPLATE_CHUNKS_MD + "\n\nIMPORTANT: " + format_instructions
+            )
+            template_merge_prompt = (
+                TEMPLATE_MERGE_MD + "\n\nIMPORTANT: " + format_instructions
+            )
         else:
-            template_no_chunks_prompt = TEMPLATE_NO_CHUNKS
-            template_chunks_prompt = TEMPLATE_CHUNKS
-            template_merge_prompt = TEMPLATE_MERGE
+            template_no_chunks_prompt = (
+                TEMPLATE_NO_CHUNKS + "\n\nIMPORTANT: " + format_instructions
+            )
+            template_chunks_prompt = (
+                TEMPLATE_CHUNKS + "\n\nIMPORTANT: " + format_instructions
+            )
+            template_merge_prompt = (
+                TEMPLATE_MERGE + "\n\nIMPORTANT: " + format_instructions
+            )
 
         if self.additional_info is not None:
             template_no_chunks_prompt = self.additional_info + template_no_chunks_prompt
@@ -161,8 +178,9 @@ def execute(self, state: dict) -> dict:
                 answer = self.invoke_with_timeout(
                     chain, {"question": user_prompt}, self.timeout
                 )
-            except Timeout:
-                state.update({self.output[0]: {"error": "Response timeout exceeded"}})
+            except (Timeout, json.JSONDecodeError) as e:
+                error_msg = "Response timeout exceeded" if isinstance(e, Timeout) else "Invalid JSON response format"
+                state.update({self.output[0]: {"error": error_msg, "raw_response": str(e)}})
                 return state
 
             state.update({self.output[0]: answer})
@@ -191,14 +209,9 @@ def execute(self, state: dict) -> dict:
             batch_results = self.invoke_with_timeout(
                 async_runner, {"question": user_prompt}, self.timeout
             )
-        except Timeout:
-            state.update(
-                {
-                    self.output[0]: {
-                        "error": "Response timeout exceeded during chunk processing"
-                    }
-                }
-            )
+        except (Timeout, json.JSONDecodeError) as e:
+            error_msg = "Response timeout exceeded during chunk processing" if isinstance(e, Timeout) else "Invalid JSON response format in chunk processing"
+            state.update({self.output[0]: {"error": error_msg, "raw_response": str(e)}})
             return state
 
         merge_prompt = PromptTemplate(
@@ -216,10 +229,9 @@ def execute(self, state: dict) -> dict:
                 {"context": batch_results, "question": user_prompt},
                 self.timeout,
             )
-        except Timeout:
-            state.update(
-                {self.output[0]: {"error": "Response timeout exceeded during merge"}}
-            )
+        except (Timeout, json.JSONDecodeError) as e:
+            error_msg = "Response timeout exceeded during merge" if isinstance(e, Timeout) else "Invalid JSON response format during merge"
+            state.update({self.output[0]: {"error": error_msg, "raw_response": str(e)}})
             return state
 
         state.update({self.output[0]: answer})