import os
import pickle
import random
import time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import MoveTargetOutOfBoundsException
from selenium.webdriver.common.action_chains import ActionChains
# Emotion categories to collect sketches for. Each name is substituted into
# the search URL and used as a sub-folder / file-name prefix.
emotions = "happiness anger disgust fear sadness surprise".split()

# Google Images search URL template; "{emotion}" is filled in per search.
# NOTE(review): the original comment describes tbs=il:cl as a "public domain"
# filter; it appears to be Google's usage-rights licence filter -- confirm.
base_url = (
    "https://www.google.com/search"
    "?q=sketch+of+{emotion}"
    "&tbm=isch"
    "&tbs=il:cl"
)

# Root folder that receives the downloaded images.
output_dir = "emotion_sketches"

# Number of images wanted per emotion (set to 100 for testing).
max_images = 100
# Pool of User-Agent strings; one is picked at random for each batch of
# image download requests.
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; AS; rv:11.0) like Gecko",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:61.0) Gecko/20100101 Firefox/61.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/18.17763 Safari/537.36",
    "Mozilla/5.0 (Linux; Android 8.0.0; SM-G950F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.116 Mobile Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.2 Safari/605.1.15",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 12_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36",
    "Mozilla/5.0 (Windows Phone 10.0; Android 6.0.1; Microsoft; Lumia 950) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Mobile Safari/537.36 Edge/15.15063",
    "Mozilla/5.0 (Linux; Android 9; Pixel 3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Mobile Safari/537.36",
    "Mozilla/5.0 (iPad; CPU OS 13_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0",
    "Mozilla/5.0 (Linux; U; Android 7.0; en-US; Nexus 5X Build/NBD90W) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.116 Mobile Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36",
    "Mozilla/5.0 (Linux; U; Android 4.4.2; en-US; SM-T530NU Build/KOT49H) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/30.0.0.0 Safari/537.36",
]
# Pool of Accept-Language header values; one is picked at random alongside
# the User-Agent for the image download requests.
accept_languages = [
    "en-US,en;q=0.9",
    "en-GB,en;q=0.8",
    "fr-FR,fr;q=0.9,en;q=0.8",
    "de-DE,de;q=0.9,en;q=0.7",
    "es-ES,es;q=0.9,en;q=0.6",
    "it-IT,it;q=0.9,en;q=0.8",
    "ja-JP,ja;q=0.9,en;q=0.8",
    "ko-KR,ko;q=0.9,en;q=0.7",
    "zh-CN,zh;q=0.9,en;q=0.8",
    "pt-BR,pt;q=0.9,en;q=0.7",
    "ru-RU,ru;q=0.9,en;q=0.8",
    "nl-NL,nl;q=0.9,en;q=0.7",
    "sv-SE,sv;q=0.9,en;q=0.6",
]
# Build the Chrome WebDriver session: headless mode plus the --no-sandbox and
# --disable-dev-shm-usage command-line switches.
chrome_options = webdriver.ChromeOptions()
for _chrome_flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
    chrome_options.add_argument(_chrome_flag)

driver = webdriver.Chrome(options=chrome_options)
print("chromedriver ready")

# Land on google.com before anything else touches the session.
driver.get("https://www.google.com")
# Reuse cookies from a previous run when a saved cookie file exists.
cookie_file = "cookies.pkl"
if os.path.exists(cookie_file):
    # SECURITY: pickle.load can execute arbitrary code embedded in the file,
    # so cookies.pkl must only ever be a file written by this script itself.
    try:
        with open(cookie_file, "rb") as f:
            cookies = pickle.load(f)
    except (pickle.UnpicklingError, EOFError) as e:
        # Cookie reuse is optional: a truncated or corrupt file (for example
        # from an interrupted earlier run) must not abort the whole crawl.
        print(f"Could not read {cookie_file}, continuing without saved cookies: {e}")
    else:
        for cookie in cookies:
            driver.add_cookie(cookie)
        print("Cookies loaded and added to the driver.")
# Create the image output folder. Attempting the creation directly (instead
# of checking for existence first) avoids the check-then-create race; the
# message is still printed only when the folder was actually created.
try:
    os.makedirs(output_dir)
except FileExistsError:
    pass
else:
    print(f"Output directory {output_dir} created.")
def _image_extension(content_type, img_url):
    """Return the file extension ("jpg" or "png") for a downloaded image.

    The response's Content-Type is consulted first, since an image URL does
    not necessarily contain an extension. When the header is missing or
    names some other type, fall back to the URL-substring heuristic.
    """
    media_type = (content_type or "").split(";")[0].strip().lower()
    if media_type in ("image/jpeg", "image/jpg"):
        return "jpg"
    if media_type == "image/png":
        return "png"
    return "jpg" if "jpg" in img_url else "png"


def _save_image(img_url, headers, file_stem, target_dir):
    """Download ``img_url`` into ``target_dir`` and return the file name used.

    Raises:
        requests.exceptions.RequestException: on HTTP errors, timeouts, or
            URLs that requests cannot fetch.
        OSError: if the file cannot be written.
    """
    response = requests.get(img_url, headers=headers, timeout=10)
    response.raise_for_status()  # turn HTTP error statuses into exceptions
    img_format = _image_extension(response.headers.get("Content-Type"), img_url)
    file_name = f"{file_stem}.{img_format}"
    with open(os.path.join(target_dir, file_name), "wb") as handler:
        handler.write(response.content)
    return file_name


# Crawl the search results for every emotion and save the images found.
# The try/finally guarantees the browser is shut down even when the crawl
# is interrupted or fails part-way through.
try:
    for emotion in emotions:
        print(f"Starting search for {emotion} sketches...")
        emotion_dir = os.path.join(output_dir, emotion)
        if not os.path.exists(emotion_dir):
            os.makedirs(emotion_dir)
            print(f"Directory {emotion_dir} created.")
        search_url = base_url.format(emotion=emotion)
        count = 0
        page = 0
        while count < max_images:
            # Randomised headers for this page's image downloads. They are
            # passed to requests only; the Selenium browser does not use them.
            headers = {
                "User-Agent": random.choice(user_agents),
                "Referer": "https://www.google.com/",
                "Accept-Language": random.choice(accept_languages),
                "Accept-Encoding": "gzip, deflate, br",
            }
            # Load the next page of results.
            print(f"Fetching page {page + 1} for {emotion}...")
            driver.get(search_url + f"&start={page * 20}")
            time.sleep(random.uniform(20, 60))  # random 20-60 s delay between page loads
            # Parse the rendered HTML and collect the result thumbnails.
            soup = BeautifulSoup(driver.page_source, "html.parser")
            images = soup.find_all("img", {"class": "rg_i Q4LuWd"})
            print(f"Found {len(images)} images on page {page + 1} for {emotion}.")
            # Scroll and nudge the mouse to imitate user interaction.
            driver.execute_script("window.scrollBy(0, window.innerHeight);")
            action = ActionChains(driver)
            try:
                # move_by_offset is relative to the current pointer position,
                # so repeated positive offsets eventually leave the viewport.
                # The move is cosmetic; a failure must not stop the crawl.
                action.move_by_offset(random.randint(0, 100), random.randint(0, 100)).perform()
            except MoveTargetOutOfBoundsException as e:
                print(f"Skipping mouse move on page {page + 1}: {e}")
            time.sleep(random.uniform(1, 3))  # short extra delay
            # Download and store each image found on this page.
            for img in images:
                img_url = img.get("src") or img.get("data-src")
                if not img_url:
                    continue
                print(f"Downloading image {count + 1} for {emotion}...")
                try:
                    file_name = _save_image(
                        img_url, headers, f"{emotion}_{count:05d}", emotion_dir
                    )
                except requests.exceptions.RequestException as e:
                    # HTTP errors, timeouts, unsupported URL schemes, ...
                    print(f"Error downloading {img_url}: {e}")
                    continue
                except Exception as e:
                    # Per-image best-effort boundary: report and carry on.
                    print(f"Unexpected error with image {count}: {e}")
                    continue
                print(f"Saved {file_name}")
                count += 1
                if count >= max_images:
                    break  # enough images for this emotion
            page += 1  # advance to the next page of results
            # Save the session cookies for reuse by a later run.
            # NOTE(review): as written this fires only when the first page of
            # an emotion produced zero saved images; the original comment says
            # "save once after the first session" -- confirm `count == 0`.
            if page == 1 and count == 0:
                with open(cookie_file, "wb") as f:
                    pickle.dump(driver.get_cookies(), f)
            if not images:
                # An empty page means there is nothing more to collect; without
                # this check the while-loop would never terminate.
                print(f"No images found on page {page} for {emotion}; stopping with {count} saved.")
                break
finally:
    driver.quit()