Skip to content

Commit 05cc9a8

Browse files
authored
Revert "[Error Solved] 'executable_path' "
1 parent ab000eb commit 05cc9a8

File tree

1 file changed

+86
-112
lines changed

1 file changed

+86
-112
lines changed

Web_app/Scarper.py

Lines changed: 86 additions & 112 deletions
Original file line number | Diff line number | Diff line change
@@ -1,119 +1,93 @@
1-
import streamlit as st
2-
from selenium import webdriver
3-
from webdriver_manager.chrome import ChromeDriverManager
4-
from selenium.webdriver.chrome.service import Service
1+
from selenium.webdriver.common.by import By
2+
from selenium.webdriver.common.keys import Keys
3+
from selenium.webdriver.support.ui import WebDriverWait
4+
from selenium.webdriver.support import expected_conditions as EC
55
import time
66
import csv
77
import re
88
from bs4 import BeautifulSoup
9-
import os
10-
from streamlit_lottie import st_lottie
11-
import json
12-
13-
# Read the Lottie animation JSON (path is relative to the working directory)
# and hand the parsed data to st_lottie.
with open('Movie_Animated.json', encoding='utf-8') as lottie_file:
    animation_data = json.load(lottie_file)

# The positional arguments are passed through exactly as in the original call.
st_lottie(animation_data, 1, True, True, "high", 150, -100)
16-
17-
# Function to scrape IMDb data
18-
def scrape_imdb_data():
19-
options = webdriver.ChromeOptions()
20-
options.add_argument('--no-sandbox')
21-
options.add_argument('--disable-dev-shm-usage')
22-
options.add_argument('--headless') # Run Chrome in headless mode
23-
24-
service = Service(ChromeDriverManager().install())
25-
driver = webdriver.Chrome(options=options, service=service)
26-
27-
driver.get('https://www.imdb.com/search/title/?title_type=tv_series,feature,tv_movie,tv_episode,tv_miniseries,tv_special&release_date=2000-01-01,2024-12-31')
28-
driver.set_script_timeout(10000)
29-
30-
def load_more_results():
    """Click the page's "see more" button once.

    Waits up to 10 seconds for a button whose class contains
    ``ipc-see-more__button`` to become clickable, scrolls it into view,
    clicks it through JavaScript, then sleeps for 2 seconds.

    Returns:
        True when the click sequence completed; False when any exception was
        raised along the way (the exception is printed).
    """
    try:
        see_more_locator = (By.XPATH, '//button[contains(@class, "ipc-see-more__button")]')
        waiter = WebDriverWait(driver, 10)
        button = waiter.until(EC.element_to_be_clickable(see_more_locator))
        # Both steps run as injected scripts against the same element.
        for script in ("arguments[0].scrollIntoView(true);", "arguments[0].click();"):
            driver.execute_script(script, button)
        # Presumably gives the newly requested results time to render.
        time.sleep(2)
    except Exception as e:
        print(f"Error: {e}")
        return False
    return True
42-
43-
def save_to_csv(movies, filename='movies.csv'):
    """Append movie records to a CSV file, writing the header only once.

    Args:
        movies: Sequence of dicts that share the same keys. The keys of the
            first record become the CSV columns.
        filename: Path of the CSV file; it is opened in append mode.

    The header row is written when the file does not exist yet or exists but
    is empty. An empty ``movies`` sequence is a no-op (previously it raised
    IndexError on ``movies[0]``).
    """
    if not movies:
        return

    # A zero-byte file (e.g. left by an interrupted run) still needs a header.
    needs_header = not os.path.isfile(filename) or os.path.getsize(filename) == 0
    fieldnames = list(movies[0].keys())
    with open(filename, 'a', newline='', encoding='utf-8') as output_file:
        dict_writer = csv.DictWriter(output_file, fieldnames=fieldnames)
        if needs_header:
            dict_writer.writeheader()
        dict_writer.writerows(movies)
9+
from selenium.webdriver.chrome.options import Options
10+
from selenium import webdriver
5111

52-
all_movies = []
53-
cnt = 0
54-
while cnt < 300:
55-
cnt += 1
56-
if not load_more_results():
12+
from selenium.webdriver.chrome.service import Service

# Location of the local chromedriver binary.
DRIVER_PATH = 'E:/chromedriver-win64/chromedriver'

# Initialize the Chrome driver.
options = webdriver.ChromeOptions()
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
# Selenium 4 takes the driver location through a Service object; the
# `executable_path` keyword on webdriver.Chrome was deprecated in 4.0 and
# later removed, so passing it directly raises a TypeError on current releases.
driver = webdriver.Chrome(service=Service(executable_path=DRIVER_PATH), options=options)

# Navigate to the URL
driver.get('https://www.imdb.com/search/title/?title_type=tv_series,feature,tv_movie,tv_episode,tv_miniseries,tv_special&release_date=2000-01-01,2024-12-31')

# NOTE(review): Selenium documents this timeout in seconds, so 10000 is close
# to three hours - confirm that milliseconds were not intended.
driver.set_script_timeout(10000)
25+
def load_more_results():
    """Try to press the "see more" button and report whether it worked.

    The button is located by an XPath matching the ``ipc-see-more__button``
    class and must become clickable within 10 seconds. It is scrolled into
    view and clicked via ``driver.execute_script``, followed by a 2-second
    sleep.

    Returns:
        True if every step succeeded, False if any step raised; the exception
        text is printed in that case.
    """
    try:
        see_more = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable(
                (By.XPATH, '//button[contains(@class, "ipc-see-more__button")]')
            )
        )
        driver.execute_script("arguments[0].scrollIntoView(true);", see_more)
        driver.execute_script("arguments[0].click();", see_more)
        time.sleep(2)
    except Exception as err:
        print(f"Error: {err}")
        clicked = False
    else:
        clicked = True
    return clicked
37+
def save_to_csv(movies, filename='movies.csv'):
    """Append movie records to a CSV file, writing the header only once.

    Args:
        movies: Sequence of dicts that share the same keys. The keys of the
            first record become the CSV columns.
        filename: Path of the CSV file; it is opened in append mode.

    Because the file is opened for appending, the header is written only when
    the file is missing or empty. Writing it unconditionally would insert a
    duplicate header row on every later call or re-run. An empty ``movies``
    sequence is a no-op instead of raising IndexError on ``movies[0]``.
    """
    # Local import: this version of the module does not import os at file level.
    import os

    if not movies:
        return

    needs_header = not os.path.isfile(filename) or os.path.getsize(filename) == 0
    fieldnames = list(movies[0].keys())
    with open(filename, 'a', newline='', encoding='utf-8') as output_file:
        dict_writer = csv.DictWriter(output_file, fieldnames=fieldnames)
        if needs_header:
            dict_writer.writeheader()
        dict_writer.writerows(movies)
43+
44+
45+
all_movies=[]
46+
cnt=0
47+
while(cnt<300):
48+
cnt+=1
49+
print(cnt)
50+
if not load_more_results():
5751
break
58-
59-
movie_elements = driver.find_elements(By.XPATH, "//div[contains(@class, 'lister-item mode-advanced')]")
52+
53+
movie_elements = driver.find_element(By.XPATH, "/html/body/div[2]/main/div[2]/div[3]/section/section/div/section/section/div[2]/div/section/div[2]/div[2]/ul")
54+
print("movie_list")
55+
56+
html_content = movie_elements.get_attribute('outerHTML')
57+
print("html movie_list")
58+
soup = BeautifulSoup(html_content, 'html.parser')
59+
60+
lst= soup.find_all("li", class_="ipc-metadata-list-summary-item")
61+
print("list")
62+
for i in lst:
63+
org_title= i.find("h3",class_="ipc-title__text").text
64+
try:
65+
title=re.sub(r'\d+\.\s*', '', org_title)
66+
except:
67+
title="NA"
68+
try:
69+
year = i.find("span", class_="sc-b189961a-8 kLaxqf dli-title-metadata-item").text
6070

61-
for element in movie_elements:
62-
soup = BeautifulSoup(element.get_attribute('outerHTML'), 'html.parser')
63-
64-
try:
65-
org_title = soup.find("h3", class_="lister-item-header").find("a").text
66-
title = re.sub(r'\d+\.\s*', '', org_title)
67-
except:
68-
title = "NA"
69-
70-
try:
71-
year = soup.find("span", class_="lister-item-year").text
72-
except:
73-
year = "NA"
74-
75-
try:
76-
rating = soup.find("div", class_="ratings-bar").find("strong").text
77-
except:
78-
rating = "NA"
79-
80-
try:
81-
description = soup.find_all("p", class_="text-muted")[1].text.strip()
82-
except:
83-
description = "NA"
84-
85-
all_movies.append({
86-
'title': title,
87-
'type': "Tv-Series",
88-
'year': year,
89-
'rating': rating,
90-
'description': description
91-
})
92-
93-
if all_movies:
94-
save_to_csv(all_movies)
95-
all_movies = []
96-
97-
driver.quit()
98-
99-
# Streamlit App
100-
# Streamlit entry point. Sets the page title and shows a "Scrape IMDb Data"
# button; the button branch calls scrape_imdb_data() under a spinner and
# reports success. The contents of movies.csv are displayed as a code block,
# or an error is shown when that file does not exist.
# NOTE(review): indentation was lost in this view, so it is unclear whether the
# CSV display sits inside the button branch - confirm against the real file.
def main():
101-
st.title("IMDb Scraper")
102-
103-
if st.button("Scrape IMDb Data"):
104-
with st.spinner("Scraping IMDb data..."):
105-
scrape_imdb_data()
106-
st.success("Data scraped successfully!")
107-
108-
# Show the CSV file content
109-
st.subheader("Scraped IMDb Data:")
110-
filename = 'movies.csv'
111-
if os.path.exists(filename):
112-
with open(filename, 'r', encoding='utf-8') as file:
113-
csv_content = file.read()
114-
st.code(csv_content, language='csv')
115-
else:
116-
st.error("CSV file not found.")
117-
118-
# Call main() only when the module is executed as a script, not on import.
if __name__ == "__main__":
119-
main()
71+
except:
72+
year="NA"
73+
try:
74+
rating = i.find("span", class_='ipc-rating-star ipc-rating-star--base ipc-rating-star--imdb ratingGroup--imdb-rating').text.split()[0]
75+
except:
76+
rating="NA"
77+
try:
78+
description = i.find("div", class_='ipc-html-content-inner-div').text
79+
except:
80+
description = "NA"
81+
all_movies.append({
82+
'title': title,
83+
'type':"Tv-Series",
84+
'year': year,
85+
'rating': rating,
86+
'description': description
87+
})
88+
89+
print("saving started")
90+
if all_movies:
91+
save_to_csv(all_movies)
92+
print("completed")
93+
driver.quit()

0 commit comments

Comments
 (0)