fix(parsing.tgraph): no longer proxies images from *.wp.com

Rongronggg9 · Rongronggg9 · commit 5a675703b3a5 · 2024-09-17T23:28:39.000+08:00
Signed-off-by: Rongrong <i@rong.moe>
diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
@@ -2,6 +2,10 @@
 
 ## Unreleased
 
+### Enhancements
+
+- **No longer proxies images from `*.wp.com` when generating Telegraph posts**: `*.wp.com` is in the blocklist of `wsrv.nl` (environment variable `IMAGES_WESERV_NL`). Thus, these images are no longer proxied when generating Telegraph posts. All images from `*.wp.com` can be accessed with any referer header, so they are now kept as is.
+
 ### Bug fixes
 
 - **Canonical `DATABASE_URL` not recognized**: Since v2.9.0, `DATABASE_URL` is canonicalized before connecting to the corresponding database. However, a canonical URL pointing to a local path cannot be recognized when checking the validity of the scheme (database type). Both canonical (`scheme:/path/to/file.db`) and traditional (`scheme:///path/to/file.db`) forms of such URLs are recognized correctly now.
diff --git a/docs/CHANGELOG.zh.md b/docs/CHANGELOG.zh.md
@@ -2,6 +2,10 @@
 
 ## 未发布
 
+### 增强
+
+- **生成 Telegraph 文章时，不再代理来自 `*.wp.com` 的图像**: `*.wp.com` 位于 `wsrv.nl` (环境变量 `IMAGES_WESERV_NL`) 的阻断列表中。因此，在生成 Telegraph 文章时，这些图像不再被代理。来自 `*.wp.com` 的所有图像都可以用任何 referer 头访问，因此它们现在保持原样。
+
 ### Bug 修复
 
 - **无法识别规范的 `DATABASE_URL`**: 自 v2.9.0 起, 在连接到相应的数据库之前，`DATABASE_URL` 被规范化。然而，在检查 scheme (数据库类型) 的合法性时，无法识别指向本地路径的规范 URL。现在，此类 URL 的规范 (`scheme:/path/to/file.db`) 和传统 (`scheme:///path/to/file.db`) 形式都被正确识别。
diff --git a/src/parsing/tgraph.py b/src/parsing/tgraph.py
@@ -20,8 +20,9 @@
 from collections.abc import Awaitable
 
 import asyncio
-import time
 import aiographfix as aiograph
+import re
+import time
 from io import BytesIO
 from bs4 import BeautifulSoup
 from contextlib import suppress
@@ -40,6 +41,29 @@
 else:
     convert_table_to_png = None
 
+DOMAIN_PATTERN_TEMPLATE: Final[str] = r'^https?://(?:[^./]+\.)?(?:{domains})\.?(?:/|:|$)'
+BLOCKED_BY_WESERV_DOMAIN: Final[set[str]] = {
+    'sinaimg.cn',
+    'wp.com',
+}
+BLOCKED_BY_WESERV_RE: Final[re.Pattern] = re.compile(
+    DOMAIN_PATTERN_TEMPLATE.format(
+        domains='|'.join(map(re.escape, BLOCKED_BY_WESERV_DOMAIN)),
+    ),
+    re.I,
+)
+ALLOW_REFERER_DOMAIN: Final[set[str]] = set(filter(None, {
+    'wp.com',
+    env.IMG_RELAY_SERVER.partition('://')[2].partition('/')[0].strip('.'),
+    env.IMAGES_WESERV_NL.partition('://')[2].partition('/')[0].strip('.'),
+}))
+ALLOW_REFERER_RE: Final[re.Pattern] = re.compile(
+    DOMAIN_PATTERN_TEMPLATE.format(
+        domains='|'.join(map(re.escape, ALLOW_REFERER_DOMAIN)),
+    ),
+    re.I,
+)
+
 logger = log.getLogger('RSStT.tgraph')
 
 apis: Optional[APIs] = None
@@ -309,13 +333,15 @@ async def generate_page(self):
                     if not isAbsoluteHttpLink(attr_content):
                         tag.replaceWithChildren()
                         continue
-                    if not attr_content.startswith(env.IMG_RELAY_SERVER):
+                    if not ALLOW_REFERER_RE.match(attr_content):
                         if tag.name == 'video':
                             attr_content = env.IMG_RELAY_SERVER + attr_content
-                        if tag.name == 'img' and not attr_content.startswith(env.IMAGES_WESERV_NL):
-                            if attr_content.split('.', 1)[1].split('/', 1)[0] == 'sinaimg.cn':
-                                attr_content = env.IMG_RELAY_SERVER + attr_content
-                            attr_content = construct_weserv_url(attr_content)
+                        elif tag.name == 'img':
+                            attr_content = (
+                                env.IMG_RELAY_SERVER + attr_content
+                                if BLOCKED_BY_WESERV_RE.match(attr_content)
+                                else construct_weserv_url(attr_content)
+                            )
                     tag.attrs = {attr_name: attr_content}
 
         if self.feed_title: