Skip to content

Commit e502c3b

Browse files
committed
feat: retry crawling with jsessionid from redis
1 parent 88999c1 commit e502c3b

File tree

1 file changed

+31
-11
lines changed

1 file changed

+31
-11
lines changed

apps/kaist/portal/crawler.py

Lines changed: 31 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66

77
from apps.kaist.models import Post
88
from apps.kaist.portal.post_response import PostResponse
9+
from ara import redis
10+
from ara.log import log
911
from ara.settings import PORTAL_JSESSIONID
1012

1113

@@ -14,8 +16,11 @@ class SessionExpiredException(Exception):
1416

1517

1618
class Crawler:
19+
SESSION_KEY = "JSESSIONID"
20+
SESSION_REDIS_KEY = "crawler:jsessionid"
21+
1722
_session = requests.Session()
18-
_session_id = PORTAL_JSESSIONID
23+
_session.cookies.set(SESSION_KEY, PORTAL_JSESSIONID)
1924

2025
_KST = pytz_timezone("Asia/Seoul")
2126

@@ -68,16 +73,26 @@ def get_post(cls, post_id: int) -> Post:
6873
:param post_id: The ID of the post to get
6974
"""
7075

71-
response = cls._session.get(
72-
url=f"https://portal.kaist.ac.kr/wz/api/board/recents/{post_id}?menuNo=21",
73-
cookies={"JSESSIONID": cls._session_id},
74-
)
76+
retry_count = 1
77+
78+
while retry_count >= 0:
79+
response = cls._session.get(
80+
f"https://portal.kaist.ac.kr/wz/api/board/recents/{post_id}?menuNo=21"
81+
)
7582

76-
if "application/json" not in response.headers["Content-Type"]:
77-
raise SessionExpiredException(f"Failed to get post {post_id}")
83+
if cls._has_fetched_successfully(response):
84+
post = cls._parse_response(response.json())
85+
return post
7886

79-
post = cls._parse_response(response.json())
80-
return post
87+
if retry_count == 0:
88+
raise SessionExpiredException(f"Failed to get post {post_id}")
89+
90+
cls.update_session_id()
91+
retry_count -= 1
92+
93+
@classmethod
def _has_fetched_successfully(cls, response: requests.Response) -> bool:
    """Return True when the response declares a JSON body.

    The check looks for "application/json" in the Content-Type header.
    A response without a Content-Type header is treated as an unsuccessful
    fetch instead of raising KeyError, so the caller can handle it like any
    other failed fetch.

    :param response: The HTTP response to inspect
    """
    content_type = response.headers.get("Content-Type", "")
    return "application/json" in content_type
8196

8297
@classmethod
8398
def find_next_post(cls, post: Post) -> Post | None:
@@ -86,5 +101,10 @@ def find_next_post(cls, post: Post) -> Post | None:
86101
return cls.get_post(post.next_post_id)
87102

88103
@classmethod
89-
def update_session_id(cls, session_id: str) -> None:
90-
cls.session_id = session_id
104+
def update_session_id(cls) -> None:
    """Refresh the crawler's JSESSIONID cookie from Redis.

    Reads the value stored under ``SESSION_REDIS_KEY``. When no value is
    stored, the current cookie is left untouched. Otherwise the value is
    decoded and set as the ``SESSION_KEY`` cookie on the shared session.
    """
    # assumes redis.get returns bytes, or None for a missing key -- TODO confirm
    raw_session_id = redis.get(cls.SESSION_REDIS_KEY)
    if raw_session_id is None:
        # Nothing stored: decoding None would raise AttributeError.
        return

    new_session_id = raw_session_id.decode()
    # NOTE(review): this writes the raw session id to the log; consider
    # masking it if logs are accessible beyond the crawler's operators.
    log.info(
        f"KAIST Portal Crawler :: JSESSIONID updated to ({new_session_id})"
    )
    cls._session.cookies.set(cls.SESSION_KEY, new_session_id)

0 commit comments

Comments
 (0)