HomeAboutMeBlogGuest
© 2025 Sejin Cha. All rights reserved.
Built with Next.js, deployed on Vercel
장지원 페이지/
📕
2024 UGRP
/
Member Page
Member Page
/
권태완
권태완
/
2024/11/05 - dataset crawling

2024/11/05 - dataset crawling

dataset crawling 하기 - sketch of emotions

  • "sketch of {emotion}" 검색어로 찾은 이미지들을 감정당 1000개씩 저장하는 코드 (아래 코드는 테스트용으로 감정당 100개로 설정되어 있음)
  • 구현 사항
    • 자동 크롤링 차단을 막기 위한 랜덤 헤더
    • 요청에 대해서 랜덤한 시간 초의 딜레이
    • 데이터를 일정한 포맷으로 저장하기
 
Tags
# Crawl Google Images for "sketch of <emotion>" pictures and store them on disk.
#
# For every emotion in `emotions` the script opens the image-search result
# pages in headless Chrome, parses the thumbnails with BeautifulSoup, and
# downloads up to `max_images` files into `<output_dir>/<emotion>/` named
# `<emotion>_<5-digit index>.<jpg|png>`.
#
# Anti-blocking measures: random User-Agent / Accept-Language headers on the
# download requests, random delays between page loads, and a small random
# scroll / mouse move on every page.
import os
import pickle
import random
import time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.action_chains import ActionChains

# Emotion list and crawl settings.
emotions = ["happiness", "anger", "disgust", "fear", "sadness", "surprise"]
# Original note: the tbs=il:cl parameter adds a public-domain filter.
base_url = "https://www.google.com/search?q=sketch+of+{emotion}&tbm=isch&tbs=il:cl"
output_dir = "emotion_sketches"
max_images = 100  # images wanted per emotion (set to 100 for testing)
# Give up on an emotion after this many consecutive pages that saved nothing;
# without this cap the crawl loop would spin forever once results run out.
max_empty_pages = 3
cookie_file = "cookies.pkl"

# User-Agent pool.
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; AS; rv:11.0) like Gecko",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:61.0) Gecko/20100101 Firefox/61.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/18.17763 Safari/537.36",
    "Mozilla/5.0 (Linux; Android 8.0.0; SM-G950F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.116 Mobile Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.2 Safari/605.1.15",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 12_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36",
    "Mozilla/5.0 (Windows Phone 10.0; Android 6.0.1; Microsoft; Lumia 950) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Mobile Safari/537.36 Edge/15.15063",
    "Mozilla/5.0 (Linux; Android 9; Pixel 3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Mobile Safari/537.36",
    "Mozilla/5.0 (iPad; CPU OS 13_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0",
    "Mozilla/5.0 (Linux; U; Android 7.0; en-US; Nexus 5X Build/NBD90W) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.116 Mobile Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36",
    "Mozilla/5.0 (Linux; U; Android 4.4.2; en-US; SM-T530NU Build/KOT49H) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/30.0.0.0 Safari/537.36",
]

# Accept-Language pool.
accept_languages = [
    "en-US,en;q=0.9",
    "en-GB,en;q=0.8",
    "fr-FR,fr;q=0.9,en;q=0.8",
    "de-DE,de;q=0.9,en;q=0.7",
    "es-ES,es;q=0.9,en;q=0.6",
    "it-IT,it;q=0.9,en;q=0.8",
    "ja-JP,ja;q=0.9,en;q=0.8",
    "ko-KR,ko;q=0.9,en;q=0.7",
    "zh-CN,zh;q=0.9,en;q=0.8",
    "pt-BR,pt;q=0.9,en;q=0.7",
    "ru-RU,ru;q=0.9,en;q=0.8",
    "nl-NL,nl;q=0.9,en;q=0.7",
    "sv-SE,sv;q=0.9,en;q=0.6",
]

# Chrome WebDriver setup.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
driver = webdriver.Chrome(options=chrome_options)
print("chromedriver ready")

# Everything that uses the browser runs inside try/finally so the Chrome
# process is always shut down, even when the crawl aborts with an exception.
try:
    # Cookies can only be added for the domain that is currently loaded.
    driver.get("https://www.google.com")

    # Reuse cookies from a previous run if a cookie file exists.
    if os.path.exists(cookie_file):
        # NOTE(security): pickle.load can execute arbitrary code; only load a
        # cookie file that this script wrote itself.
        with open(cookie_file, "rb") as f:
            cookies = pickle.load(f)
        for cookie in cookies:
            driver.add_cookie(cookie)
        print("Cookies loaded and added to the driver.")

    # Create the root output folder.
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        print(f"Output directory {output_dir} created.")

    cookies_saved = False  # the session cookies are written at most once per run

    # Crawl and save images for each emotion.
    for emotion in emotions:
        print(f"Starting search for {emotion} sketches...")
        emotion_dir = os.path.join(output_dir, emotion)
        if not os.path.exists(emotion_dir):
            os.makedirs(emotion_dir)
            print(f"Directory {emotion_dir} created.")

        search_url = base_url.format(emotion=emotion)
        count = 0  # images saved so far for this emotion
        page = 0  # zero-based result page index
        empty_pages = 0  # consecutive pages that produced no saved image

        while count < max_images:
            # Random headers for this page's image downloads.
            headers = {
                "User-Agent": random.choice(user_agents),
                "Referer": "https://www.google.com/",
                "Accept-Language": random.choice(accept_languages),
                "Accept-Encoding": "gzip, deflate, br",
            }

            # Load the next result page.
            print(f"Fetching page {page + 1} for {emotion}...")
            driver.get(search_url + f"&start={page * 20}")
            time.sleep(random.uniform(20, 60))  # random 20-60 s delay between page requests

            # Parse the rendered HTML.
            # NOTE(review): the class selector depends on Google's current
            # markup; verify it still matches if no images are found.
            soup = BeautifulSoup(driver.page_source, "html.parser")
            images = soup.find_all("img", {"class": "rg_i Q4LuWd"})
            print(f"Found {len(images)} images on page {page + 1} for {emotion}.")

            # Scroll and move the mouse a little to look less like a bot.
            driver.execute_script("window.scrollBy(0, window.innerHeight);")
            try:
                action = ActionChains(driver)
                action.move_by_offset(random.randint(0, 100), random.randint(0, 100)).perform()
            except WebDriverException as e:
                # The offset is relative to the current pointer position, so it
                # can leave the viewport; the move is cosmetic, so skip it.
                print(f"Mouse move skipped: {e}")
            time.sleep(random.uniform(1, 3))  # extra delay

            saved_before_page = count
            for img in images:
                # Prefer a real http(s) URL: `src` is often an inline data:
                # placeholder that requests cannot fetch, while `data-src`
                # holds the actual thumbnail address.
                img_url = next(
                    (
                        url
                        for url in (img.get("src"), img.get("data-src"))
                        if url and url.startswith(("http://", "https://"))
                    ),
                    None,
                )
                if not img_url:
                    continue

                try:
                    print(f"Downloading image {count + 1} for {emotion}...")
                    response = requests.get(img_url, headers=headers, timeout=10)
                    response.raise_for_status()  # raise on HTTP error status

                    # Trust the server's Content-Type for the extension and
                    # fall back to the URL heuristic only when it is unclear.
                    content_type = response.headers.get("Content-Type", "").lower()
                    if "jpeg" in content_type or "jpg" in content_type:
                        img_format = "jpg"
                    elif "png" in content_type:
                        img_format = "png"
                    else:
                        img_format = "jpg" if "jpg" in img_url else "png"

                    file_name = f"{emotion}_{count:05d}.{img_format}"
                    file_path = os.path.join(emotion_dir, file_name)
                    with open(file_path, "wb") as handler:
                        handler.write(response.content)
                    print(f"Saved {file_name}")
                except requests.exceptions.RequestException as e:
                    # HTTP errors, timeouts, connection problems: skip this image.
                    print(f"Error downloading {img_url}: {e}")
                    continue
                except Exception as e:
                    # Best-effort crawl: report and move on to the next image.
                    print(f"Unexpected error with image {count}: {e}")
                    continue

                count += 1
                if count >= max_images:
                    break  # enough images for this emotion

            page += 1  # move on to the next result page

            # Save the session cookies once, after the first page was fetched.
            if not cookies_saved:
                with open(cookie_file, "wb") as f:
                    pickle.dump(driver.get_cookies(), f)
                cookies_saved = True

            # Stop instead of looping forever when pages keep yielding nothing.
            if count == saved_before_page:
                empty_pages += 1
                if empty_pages >= max_empty_pages:
                    print(
                        f"No new images for {emotion} after {empty_pages} "
                        f"consecutive pages; stopping at {count} images."
                    )
                    break
            else:
                empty_pages = 0
finally:
    driver.quit()