Portfolio

📌

참고하면 좋을, 여러 데이터 셋 만드는 방법 Ref

나만의 데이터셋 만들기

데이터셋 구축 절차

[JSON]

[CSV]

[Or not…]

[Our Research]

How

우리 데이터 셋은 어떻게 만들어야 할까?

JSON이면

→ label: image, description, 가중치(?) → 오 가중치를 두어서 학습 시킬 수 있을까?

JSON이 아니면

→ 그냥 이미지에 label만 달아 두어도 될 것 같다.

나만의 데이터셋 만들기

데이터셋 구축 절차

++ 이 외에도 우리가 조사한 (1-10)척도를 활용한 분석을 하는 것도 좋을 것 같다.

Making Dataset


import json

# Emotion dataset definition: every emotion label maps to a list of image
# records. Each record stores its id, image location, description and two
# weights, all as strings.
emotion_dataset = dict([
    (
        "기쁨",
        [
            dict(
                id="01",
                # Image URL (in the browser: right-click -> copy image link).
                image="https://github.com/JANGJIWONEDA/UGRP_jangjiwon/blob/main/dataset/amusement/%EB%82%A8%EC%88%98%EB%AF%BC.jpeg?raw=true",
                description="웃고 있는 사람의 이미지",
                weight1="1",
                weight2="5",
            ),
            # Further records use the same fields, e.g. id "02" with a local
            # image file name such as "1.png" and its own description.
        ],
    ),
    # More emotions (for example a sadness label with an id "03" record) can
    # be appended here as additional (label, records) pairs.
])

# Serialise the dataset to text first, then write it to disk in one call.
# ensure_ascii=False keeps the Korean text readable in the output file.
serialized_dataset = json.dumps(emotion_dataset, ensure_ascii=False, indent=4)
with open('emotion_dataset.json', 'w', encoding='utf-8') as json_file:
    json_file.write(serialized_dataset)

print("JSON 파일 생성 완료!")


import json
import webbrowser

# Read the dataset JSON file.
with open('emotion_dataset.json', 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

# Look up the image URL of the first record whose id equals target_id,
# scanning every emotion's record list; None when no record matches.
target_id = "01"
image_url = next(
    (
        image_info['image']
        for images in data.values()
        for image_info in images
        if image_info['id'] == target_id
    ),
    None,
)

if image_url:
    try:
        # webbrowser.open() reports failure through its return value (False)
        # rather than by raising, so the result must be checked before
        # announcing success.
        opened = webbrowser.open(image_url)
    except Exception as e:
        print(f"이미지를 웹 브라우저에서 여는 데 실패했습니다: {e}")
    else:
        if opened:
            print(f"이미지 '{image_url}'을(를) 웹 브라우저에서 성공적으로 열었습니다.")
        else:
            print("이미지를 웹 브라우저에서 여는 데 실패했습니다: 사용 가능한 브라우저를 찾지 못했습니다.")
else:
    print(f"ID '{target_id}'에 해당하는 이미지를 찾을 수 없습니다.")


import json
import requests
from PIL import Image
from io import BytesIO
import matplotlib.pyplot as plt

# Read the dataset JSON file.
with open('emotion_dataset.json', 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

# Look up the image URL of the first record whose id equals target_id,
# scanning every emotion's record list; None when no record matches.
target_id = "01"
image_url = next(
    (
        image_info['image']
        for images in data.values()
        for image_info in images
        if image_info['id'] == target_id
    ),
    None,
)

if image_url:
    try:
        # Download the image. The timeout bounds the wait so that a stalled
        # server cannot hang the notebook cell indefinitely.
        response = requests.get(image_url, timeout=10)
        response.raise_for_status()  # turn HTTP error statuses into exceptions

        # Decode the downloaded bytes as an image.
        img = Image.open(BytesIO(response.content))

        # Render the image inline (e.g. in Colab) without axes.
        plt.imshow(img)
        plt.axis('off')
        plt.show()

        print(f"이미지 '{image_url}'을(를) 성공적으로 다운로드하여 표시했습니다.")
    except Exception as e:
        # Top-level boundary of the cell: report any download, decode or
        # display failure instead of aborting with a traceback.
        print(f"이미지를 다운로드하거나 표시하는 데 실패했습니다: {e}")
else:
    print(f"ID '{target_id}'에 해당하는 이미지를 찾을 수 없습니다.")

Children Drawings

Kaggle is the world’s largest data science community with powerful tools and resources to help you achieve your data science goals.

→ emotion drawing dataset 키워드로 검색하면 많이 나온다.

NCBI - WWW Error Blocked Diagnostic

Your access to the NCBI website at www.ncbi.nlm.nih.gov has been temporarily blocked due to a possible misuse/abuse situation involving your site. This is not an indication of a security issue such as a virus or attack. It could be something as simple as a run away script or learning how to better use E-utilities, http://www.ncbi.nlm.nih.gov/books/NBK25497/, for more efficient work such that your work does not impact the ability of other researchers to also use our site. To restore access and understand how to better interact with our site to avoid this in the future, please have your system administrator contact info@ncbi.nlm.nih.gov.

[Dataset Making]

trash code


import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urljoin

def download_images(query, num_images, folder_name):
    """Save images found on a Google image-search result page.

    Requests the search page for ``query``, takes at most ``num_images``
    ``<img>`` tags from the returned HTML and writes each downloadable one to
    ``images/<folder_name>/<folder_name>_<index>.jpg``.

    Args:
        query: Search text. It is passed through ``params`` so spaces and
            reserved characters such as ``&`` or ``#`` are URL-encoded.
        num_images: Maximum number of ``<img>`` tags to consider.
        folder_name: Sub-folder of ``images`` and file-name prefix.

    Raises:
        requests.RequestException: If the search page itself cannot be
            fetched (including a timeout). Failures of individual image
            downloads are printed and skipped instead.
    """
    search_url = "https://www.google.com/search"
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(
        search_url,
        params={"hl": "en", "tbm": "isch", "q": query},
        headers=headers,
        timeout=10,  # do not wait forever on a stalled connection
    )
    soup = BeautifulSoup(response.text, 'html.parser')
    img_tags = soup.find_all('img', limit=num_images)

    # Creates the parent 'images' folder as well; no error if present.
    folder_path = os.path.join('images', folder_name)
    os.makedirs(folder_path, exist_ok=True)

    for i, img in enumerate(img_tags):
        img_url = img.get('src')
        if not img_url:
            continue
        # Resolve relative paths against the URL that was actually fetched.
        img_url = urljoin(response.url, img_url)
        if not img_url.startswith(('http://', 'https://')):
            # e.g. inline "data:" placeholders, which cannot be requested.
            continue
        try:
            img_response = requests.get(img_url, timeout=10)
            # Avoid saving an HTTP error page under a .jpg name.
            img_response.raise_for_status()
            file_path = os.path.join(folder_path, f'{folder_name}_{i}.jpg')
            with open(file_path, 'wb') as file:
                file.write(img_response.content)
        except (requests.RequestException, OSError) as e:
            print(f"Could not download {img_url}: {e}")

# One (search query, destination folder) pair per emotion category; the query
# is the lower-cased folder name followed by " drawing".
queries = [
    (f"{label.lower()} drawing", label)
    for label in (
        "Amusement",
        "Awe",
        "Contentment",
        "Excitement",
        "Anger",
        "Disgust",
        "Fear",
        "Sadness",
    )
]

# Run the downloader for every category, considering up to 200 image tags each.
for search_text, target_folder in queries:
    download_images(search_text, 200, target_folder)

with Crawling

크롤링 하는 법

셀레니움이라는 라이브러리 사용

(주의: 코랩에서는 로컬 환경에 접근할 수 없기 때문에, 코랩 안에 크롬과 크롬 드라이버를 따로 설치해 주어야 한다)

코랩에서 크롬 및 크롬 드라이버를 다운로드 받는 코드 (크롬과 크롬 드라이버의 버전을 잘 맞춰야 한다)


# Install Google Chrome inside the Colab VM. Chrome version targeted by this note: 128.
# NOTE(review): the "_current_" .deb presumably tracks the latest stable
# release, which may no longer be 128 and would then not match the pinned
# ChromeDriver build below -- confirm the installed version.
!wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
!dpkg -i google-chrome-stable_current_amd64.deb
# Ask apt to repair the install, e.g. dependencies dpkg left unresolved.
!apt --fix-broken install -y

# Download and install ChromeDriver (the URL pins build 128.0.6613.119).
!wget https://storage.googleapis.com/chrome-for-testing-public/128.0.6613.119/linux64/chromedriver-linux64.zip
!unzip chromedriver-linux64.zip
# Mark the extracted chromedriver binary as executable.
!chmod +x ./chromedriver-linux64/chromedriver

실수로 폴더 만들었을 때 코랩에서 지우는 방법


import shutil

# Delete the folder named below together with everything inside it.
folder_to_remove = 'chromedriver-linux64'
shutil.rmtree(folder_to_remove)

main 코드 (google image download 함수가 있기도 한데, 이건 잘 동작하지 않는다..) + 코랩에서는 GUI 없이(headless 모드로) 열어야 한다? → 코랩에서는 크롬이 아닌 크로미움(Chromium, 크롬의 기반이 되는 오픈소스 브라우저)을 사용해야 한다?

GG 코랩으로 크롤링 하기 너무 어렵다..

https://code-code.tistory.com/165

우선 되는 코드 → 클릭해야 다운이 된다 → 클릭해야 원본 이미지의 url을 가지고 올 수 있다고 한다.


import os
import time
import urllib.request
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options

# Search keywords; each one is also turned into a folder name by main().
keywords = [
    f"{emotion} drawing"
    for emotion in (
        "amusement",
        "awe",
        "contentment",
        "excitement",
        "anger",
        "disgust",
        "fear",
        "sadness",
    )
]

# Root folder for the downloaded images. The string contains no "~", so
# expanduser() returns it unchanged: the folder is relative to the current
# working directory, not the desktop.
output_dir = os.path.expanduser("emotion_images")

# Number of image URLs to collect per keyword.
num_images = 200

# Chrome options. Headless mode is deliberately left off so the browser window
# stays visible while debugging; add the "--headless" argument to hide the GUI.
chrome_options = Options()
for chrome_flag in ("--no-sandbox", "--disable-dev-shm-usage"):
    chrome_options.add_argument(chrome_flag)

# Start Chrome with the driver binary that webdriver_manager installs.
chromedriver_path = ChromeDriverManager().install()
driver = webdriver.Chrome(service=Service(chromedriver_path), options=chrome_options)

def download_image(image_url, folder_path, image_num, timeout=10):
    """Download one image and store it as ``<image_num>.jpg``.

    The download is best-effort: every failure is printed and swallowed so
    that one bad URL does not stop a crawl over many images.

    Args:
        image_url: Address of the image; must start with ``"http"``.
        folder_path: Existing directory the file is written into.
        image_num: Number used for the file name and in log messages.
        timeout: Seconds to wait on the connection before giving up, so a
            stalled server cannot block the crawl indefinitely.

    Returns:
        None.
    """
    try:
        print(f"Downloading image {image_num}: {image_url}")
        if not image_url.startswith("http"):
            print(f"Invalid URL for image {image_num}: {image_url}")
            return

        # Send the User-Agent on this request only, instead of installing a
        # process-wide opener on every call.
        request = urllib.request.Request(
            image_url, headers={'User-Agent': 'Mozilla/5.0'}
        )
        with urllib.request.urlopen(request, timeout=timeout) as response:
            payload = response.read()

        # Write only after the whole body has been received, so a failed
        # transfer does not leave a truncated file behind.
        file_path = os.path.join(folder_path, f"{image_num}.jpg")
        with open(file_path, 'wb') as image_file:
            image_file.write(payload)
        print(f"Image {image_num} downloaded: {file_path}")
    except Exception as e:
        # Deliberately broad: network, HTTP, file-system and bad-argument
        # errors are all reported the same way.
        print(f"Error downloading image {image_num}: {e}")

def create_folder_if_not_exists(folder_name):
    """Create ``folder_name`` (and any missing parents) if it is absent.

    Uses ``exist_ok=True`` rather than a separate existence check, so there
    is no window between checking and creating in which another process
    could create the folder and make the call fail.

    Args:
        folder_name: Path of the directory to create.

    Raises:
        FileExistsError: If ``folder_name`` exists but is not a directory.
    """
    os.makedirs(folder_name, exist_ok=True)

def crawl_images(keyword):
    """Collect image URLs from a Google image search for ``keyword``.

    Loads the search page in the module-level ``driver``, then repeatedly
    reads the ``src`` of every element matching ``img.rg_i`` and scrolls to
    the bottom of the page, until ``num_images`` distinct URLs are collected
    or scrolling no longer increases the page height.

    Args:
        keyword: Search text; URL-encoded before being placed in the query.

    Returns:
        A list of distinct ``src`` values starting with ``"http"`` (at most
        ``num_images`` of them), in no particular order.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import quote_plus

    print(f"Searching images for: {keyword}")
    # Encode the keyword so spaces and reserved characters such as "&" or
    # "#" cannot truncate or alter the query string.
    search_url = f"https://www.google.com/search?q={quote_plus(keyword)}&tbm=isch"
    driver.get(search_url)

    image_urls = set()
    last_height = driver.execute_script("return document.body.scrollHeight")

    while len(image_urls) < num_images:
        # NOTE(review): "img.rg_i" is presumably Google's thumbnail class;
        # confirm it still matches the current result-page markup.
        for image in driver.find_elements(By.CSS_SELECTOR, "img.rg_i"):
            try:
                src = image.get_attribute("src")
            except Exception as e:
                # Report the element that could not be read and move on.
                print(f"Error while extracting image URL: {e}")
                continue
            if src and src.startswith('http'):
                image_urls.add(src)
                if len(image_urls) >= num_images:
                    break

        if len(image_urls) >= num_images:
            # Enough URLs collected: skip the extra scroll and wait.
            break

        # Scroll to the bottom and give the page time to load more results.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            # Page height did not change, so no further results were loaded.
            break
        last_height = new_height

    return list(image_urls)

def main():
    """Crawl and download images for every entry in ``keywords``.

    For each keyword a sub-folder of ``output_dir`` is created (spaces in the
    keyword replaced by underscores), image URLs are collected with
    ``crawl_images`` and each one is saved with ``download_image`` using
    1-based numbering. The browser is always shut down at the end, even when
    a step raises, so no Chrome process is left running.
    """
    try:
        create_folder_if_not_exists(output_dir)

        for keyword in keywords:
            folder_path = os.path.join(output_dir, keyword.replace(" ", "_"))
            create_folder_if_not_exists(folder_path)

            image_urls = crawl_images(keyword)
            print(f"Found {len(image_urls)} images for keyword: {keyword}")
            for i, url in enumerate(image_urls, start=1):
                download_image(url, folder_path, i)
    finally:
        # Release the browser regardless of how the crawl ended.
        driver.quit()


if __name__ == "__main__":
    main()

hugging face

데이터 허깅 페이스에 등록하는 법

[Dataset File]

[JSON]

[CSV]

[Or not…]

[Our Research]

[Dataset Making]