- import streamlit as st
- from selenium import webdriver
- from webdriver_manager.chrome import ChromeDriverManager
- from selenium.webdriver.chrome.service import Service
+ from selenium.webdriver.common.by import By
+ from selenium.webdriver.common.keys import Keys
+ from selenium.webdriver.support.ui import WebDriverWait
+ from selenium.webdriver.support import expected_conditions as EC
import time
import csv
import re
from bs4 import BeautifulSoup
- import os
- from streamlit_lottie import st_lottie
- import json
-
- with open('Movie_Animated.json', encoding='utf-8') as anim_source:
-     animation_data = json.load(anim_source)
- st_lottie(animation_data, 1, True, True, "high", 150, -100)
-
- # Function to scrape IMDb data
- def scrape_imdb_data():
-     options = webdriver.ChromeOptions()
-     options.add_argument('--no-sandbox')
-     options.add_argument('--disable-dev-shm-usage')
-     options.add_argument('--headless')  # Run Chrome in headless mode
-
-     service = Service(ChromeDriverManager().install())
-     driver = webdriver.Chrome(options=options, service=service)
-
-     driver.get('https://www.imdb.com/search/title/?title_type=tv_series,feature,tv_movie,tv_episode,tv_miniseries,tv_special&release_date=2000-01-01,2024-12-31')
-     driver.set_script_timeout(10000)
-
-     def load_more_results():
-         try:
-             load_more_button = WebDriverWait(driver, 10).until(
-                 EC.element_to_be_clickable((By.XPATH, '//button[contains(@class, "ipc-see-more__button")]'))
-             )
-             driver.execute_script("arguments[0].scrollIntoView(true);", load_more_button)
-             driver.execute_script("arguments[0].click();", load_more_button)
-             time.sleep(2)
-             return True
-         except Exception as e:
-             print(f"Error: {e}")
-             return False
-
-     def save_to_csv(movies, filename='movies.csv'):
-         file_exists = os.path.isfile(filename)
-         keys = movies[0].keys()
-         with open(filename, 'a', newline='', encoding='utf-8') as output_file:
-             dict_writer = csv.DictWriter(output_file, fieldnames=keys)
-             if not file_exists:
-                 dict_writer.writeheader()
-             dict_writer.writerows(movies)
+ from selenium.webdriver.chrome.options import Options
+ from selenium import webdriver

-     all_movies = []
-     cnt = 0
-     while cnt < 300:
-         cnt += 1
-         if not load_more_results():
+ DRIVER_PATH = 'E:/chromedriver-win64/chromedriver'
+ # Initialize the Chrome driver
+
+
+ options = webdriver.ChromeOptions()
+ options.add_argument('--no-sandbox')
+ options.add_argument('--disable-dev-shm-usage')
+ driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)
+
+ # Navigate to the URL
+ driver.get('https://www.imdb.com/search/title/?title_type=tv_series,feature,tv_movie,tv_episode,tv_miniseries,tv_special&release_date=2000-01-01,2024-12-31')
+
+ driver.set_script_timeout(10000)
+ def load_more_results():
+     try:
+         load_more_button = WebDriverWait(driver, 10).until(
+             EC.element_to_be_clickable((By.XPATH, '//button[contains(@class, "ipc-see-more__button")]'))
+         )
+         driver.execute_script("arguments[0].scrollIntoView(true);", load_more_button)
+         driver.execute_script("arguments[0].click();", load_more_button)
+         time.sleep(2)
+         return True
+     except Exception as e:
+         print(f"Error: {e}")
+         return False
+ def save_to_csv(movies, filename='movies.csv'):
+     keys = movies[0].keys()
+     with open(filename, 'a', newline='', encoding='utf-8') as output_file:
+         dict_writer = csv.DictWriter(output_file, fieldnames=keys)
+         dict_writer.writeheader()
+         dict_writer.writerows(movies)
+
+
+ all_movies = []
+ cnt = 0
+ while (cnt < 300):
+     cnt += 1
+     print(cnt)
+     if not load_more_results():
        break
-
-         movie_elements = driver.find_elements(By.XPATH, "//div[contains(@class, 'lister-item mode-advanced')]")
+
+     movie_elements = driver.find_element(By.XPATH, "/html/body/div[2]/main/div[2]/div[3]/section/section/div/section/section/div[2]/div/section/div[2]/div[2]/ul")
+     print("movie_list")
+
+     html_content = movie_elements.get_attribute('outerHTML')
+     print("html movie_list")
+     soup = BeautifulSoup(html_content, 'html.parser')
+
+     lst = soup.find_all("li", class_="ipc-metadata-list-summary-item")
+     print("list")
+     for i in lst:
+         org_title = i.find("h3", class_="ipc-title__text").text
+         try:
+             title = re.sub(r'\d+\.\s*', '', org_title)
+         except:
+             title = "NA"
+         try:
+             year = i.find("span", class_="sc-b189961a-8 kLaxqf dli-title-metadata-item").text

-         for element in movie_elements:
-             soup = BeautifulSoup(element.get_attribute('outerHTML'), 'html.parser')
-
-             try:
-                 org_title = soup.find("h3", class_="lister-item-header").find("a").text
-                 title = re.sub(r'\d+\.\s*', '', org_title)
-             except:
-                 title = "NA"
-
-             try:
-                 year = soup.find("span", class_="lister-item-year").text
-             except:
-                 year = "NA"
-
-             try:
-                 rating = soup.find("div", class_="ratings-bar").find("strong").text
-             except:
-                 rating = "NA"
-
-             try:
-                 description = soup.find_all("p", class_="text-muted")[1].text.strip()
-             except:
-                 description = "NA"
-
-             all_movies.append({
-                 'title': title,
-                 'type': "Tv-Series",
-                 'year': year,
-                 'rating': rating,
-                 'description': description
-             })
-
-         if all_movies:
-             save_to_csv(all_movies)
-             all_movies = []
-
-     driver.quit()
-
- # Streamlit App
- def main():
-     st.title("IMDb Scraper")
-
-     if st.button("Scrape IMDb Data"):
-         with st.spinner("Scraping IMDb data..."):
-             scrape_imdb_data()
-         st.success("Data scraped successfully!")
-
-     # Show the CSV file content
-     st.subheader("Scraped IMDb Data:")
-     filename = 'movies.csv'
-     if os.path.exists(filename):
-         with open(filename, 'r', encoding='utf-8') as file:
-             csv_content = file.read()
-         st.code(csv_content, language='csv')
-     else:
-         st.error("CSV file not found.")
-
- if __name__ == "__main__":
-     main()
+         except:
+             year = "NA"
+         try:
+             rating = i.find("span", class_='ipc-rating-star ipc-rating-star--base ipc-rating-star--imdb ratingGroup--imdb-rating').text.split()[0]
+         except:
+             rating = "NA"
+         try:
+             description = i.find("div", class_='ipc-html-content-inner-div').text
+         except:
+             description = "NA"
+         all_movies.append({
+             'title': title,
+             'type': "Tv-Series",
+             'year': year,
+             'rating': rating,
+             'description': description
+         })
+
+ print("saving started")
+ if all_movies:
+     save_to_csv(all_movies)
+ print("completed")
+ driver.quit()
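
Note on the new driver setup: the added line `webdriver.Chrome(options=options, executable_path=DRIVER_PATH)` relies on the `executable_path` keyword, which was deprecated in Selenium 4 and removed in later 4.x releases. If the script is run against a current Selenium 4 install, the driver path would instead be wrapped in a `Service` object; a minimal sketch, assuming the same local chromedriver path hard-coded in this commit:

    from selenium import webdriver
    from selenium.webdriver.chrome.service import Service

    DRIVER_PATH = 'E:/chromedriver-win64/chromedriver'  # same hard-coded path as above

    options = webdriver.ChromeOptions()
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')

    # Selenium 4 style: pass the driver path via a Service instead of executable_path
    service = Service(DRIVER_PATH)
    driver = webdriver.Chrome(service=service, options=options)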