Portfolio

[데이터 만들기]

엑셀로 만들기/코드 작성

json 파일, 이미지 폴더 만들기

form excel로 만들기

코드 작성

엑셀 파일을 보면 똑같은 질문이 여러 열로 나누어져 있는 것을 확인할 수 있다. 값이 존재하는 열을 반환하도록 코드를 짜주었다.


import pandas as pd
import os
import json
import re
import requests
from PIL import Image
from io import BytesIO

# Path to the Excel file and the output directory
file_path = r'C:\Users\User\Desktop\감정 그림 그리기 설문조사(UGRP)(응답).xlsx'  # Change this to the actual file path.
save_path = 'G:\\dataset'  # Change this to the actual output directory.

# Load the Excel file into a DataFrame
df = pd.read_excel(file_path)

# Pick the first non-NaN value among the columns whose name contains the keyword.
def find_first_non_empty_value(row, keyword):
    """Return the first non-missing value of ``row`` in a column matching ``keyword``.

    The same question can be spread over several spreadsheet columns, so the
    columns are scanned in order and the first filled-in one wins.

    Args:
        row: A ``pandas.Series`` holding one spreadsheet row, indexed by
            column name.
        keyword: Substring that a column name must contain to be considered.

    Returns:
        The first value for which ``pd.notna`` is true, or ``None`` when no
        column matches or every matching column is NaN.
    """
    # Scan the row's own labels rather than the module-level ``df`` so the
    # helper works on any row. Non-string labels cannot contain the keyword
    # and would make the ``in`` test raise TypeError, so they are skipped.
    for col in row.index:
        if isinstance(col, str) and keyword in col and pd.notna(row[col]):
            return row[col]

    return None

# Turn a Google Drive share link into a direct-download link.
def convert_drive_link(drive_url):
    """Return a direct-download URL for a Google Drive share link.

    Recognised link shapes:
      * ``...?id=<file_id>`` (optionally followed by other query parameters)
      * ``.../d/<file_id>/...`` (e.g. ``/file/d/<file_id>/view?usp=sharing``)

    If the text contains several ``id=`` parameters, the last one is used,
    which matches what the previous ``split('=')[-1]`` logic produced.
    Anything else falls back to the text after the last ``=``.

    Args:
        drive_url: The share link as a string.

    Returns:
        ``https://drive.google.com/uc?export=download&id=<file_id>``.
    """
    id_params = re.findall(r'[?&]id=([A-Za-z0-9_-]+)', drive_url)
    if id_params:
        file_id = id_params[-1]
    else:
        path_match = re.search(r'/d/([A-Za-z0-9_-]+)', drive_url)
        # Fall back to the original behaviour for unrecognised link shapes.
        file_id = path_match.group(1) if path_match else drive_url.split('=')[-1]
    return f"https://drive.google.com/uc?export=download&id={file_id}"

# Download an image and save it to disk.
def download_image(url, path):
    """Download the image at ``url`` and save it to ``path``.

    Google Drive links are first rewritten with ``convert_drive_link``.
    Every failure is printed instead of raised, so one bad link does not
    abort the whole run.

    NOTE: the image is saved as-is, in the format implied by the extension
    of ``path``. An image whose mode that format cannot store (for example
    an RGBA image saved as ``.jpg``) fails and is reported.

    Args:
        url: Address of the image.
        path: Destination file path; its parent folder is created if needed.

    Returns:
        None.
    """
    try:
        # Rewrite Google Drive share links into direct-download links
        if 'drive.google.com' in url:
            url = convert_drive_link(url)

        # A timeout keeps one unresponsive host from hanging the whole run.
        response = requests.get(url, timeout=30)
        # Report HTTP errors directly rather than parsing an error page as an image.
        response.raise_for_status()
        image = Image.open(BytesIO(response.content))

        # Create the destination folder; exist_ok avoids the exists/makedirs race.
        folder_path = os.path.dirname(path)
        if folder_path:
            os.makedirs(folder_path, exist_ok=True)

        image.save(path)
        print(f"Image saved to {path}")
    except Exception as e:
        # Deliberate best-effort boundary: log the failure and carry on.
        print(f"Failed to download {url}: {e}")

# Collect the needed fields into a new frame. An empty frame sharing the
# survey's index is the starting point; the previous expression referenced
# an undefined `columns_dict` and raised NameError.
df_selected = pd.DataFrame(index=df.index)

# Output field -> keyword identifying the survey column(s) that hold it.
# Each field takes the first non-NaN value among its matching columns.
field_keywords = {
    'name': '참여자 성명',
    'emotion': '가까운 감정',
    'image': '업로드 해주세요',
    'intention': '그림의 의도를 적어주세요',
    'weight1': '위 예시 사진',
    'weight2': '자신이 그린 그림',
}
for field, keyword in field_keywords.items():
    df_selected[field] = df.apply(find_first_non_empty_value, axis=1, args=(keyword,))

# Strip the parenthesised description from the emotion label
df_selected['emotion'] = df_selected['emotion'].apply(lambda x: re.sub(r'\([^)]*\)', '', str(x)).strip() if pd.notnull(x) else x)

# Root folder for the images; per-emotion subfolders are created below
image_folder = os.path.join(save_path, 'image')
os.makedirs(image_folder, exist_ok=True)

# Build the JSON records
json_data = []

for _, row in df_selected.iterrows():
    image_id = row['name']
    emotion = row['emotion']
    image_url = row['image']

    # A row without an emotion label cannot be filed under a folder;
    # os.path.join would raise TypeError on None/NaN, so skip it.
    if pd.isnull(emotion):
        print(f"Skipping row for {image_id}: no emotion label")
        continue

    # Per-emotion subfolder
    emotion_folder = os.path.join(image_folder, emotion)
    os.makedirs(emotion_folder, exist_ok=True)

    # Image is stored as <emotion>/<image_id>.jpg
    image_path = os.path.join(emotion_folder, f"{image_id}.jpg")

    # Download and save the image when a link was provided
    if pd.notnull(image_url):
        download_image(image_url, image_path)

    # Record the entry; the path is relative to save_path
    json_data.append({
        'image': f"image/{emotion}/{image_id}.jpg",
        'label': emotion,
        'image_id': image_id,
        'weight1': row['weight1'],
        'weight2': row['weight2'],
        'intention': row['intention']
    })

# Write the JSON file
json_file_path = os.path.join(save_path, 'dataset_info.json')
with open(json_file_path, 'w', encoding='utf-8') as f:
    json.dump(json_data, f, ensure_ascii=False, indent=4)

print("작업이 완료되었습니다.")

이미지 형식 때문에 다운이 안되는 것들 처리

RGBA 형식은 알파(투명도) 채널을 포함하고 있어 JPEG로 바로 저장할 수 없으므로, RGB로 변환한 뒤 저장해야 한다.

cannot write mode RGBA as JPEG 오류는 이미지의 모드가 JPEG 형식과 호환되지 않을 때 발생합니다. JPEG는 알파(투명도) 채널을 저장할 수 없는데, RGBA 모드는 알파 채널을 포함합니다. 따라서 RGBA 이미지를 JPEG로 저장하려고 하면 오류가 발생합니다.

이 문제를 해결하려면 RGBA 모드를 RGB 모드로 변환한 후 저장해야 합니다. download_image 함수에서 이미지를 저장하기 전에 모드를 변환하도록 수정합니다. (수정된 코드는 아래 '최종 코드'의 download_image 함수 참고) 다운로드에 실패한 참여자는 다음과 같습니다:

최혜진

오민준

강지헌

이석민

이재훈

김승태

이하임

⇒ 공통점: 업로드한 파일이 png, pdf 등으로, jpg 파일이 아니다.

png, pdf를 모두 jpg로 바꿔서 다운로드 했다.

pdf변경 시 해야 할 것들

네이버 블로그 | 하이제니스 — 파이썬(Python) - PDF를 JPEG로 변환하기 - pdf2image (윈도우 10 기준)

파이썬(Python) - PDF를 JPEG로 변환하기 - pdf2image (윈도우 10 기준)

파이썬(Python)을 이용해 "PDF"파일을 "JPEG"파일로 변환하는 방법은 여러 가...


pip install pdf2image Pillow

최종 코드


import pandas as pd
import os
import json
import re
import requests
from PIL import Image
from io import BytesIO
from pdf2image import convert_from_path
import tempfile

# Path to the Excel file and the output directory
file_path = r'C:\Users\User\Desktop\감정 그림 그리기 설문조사(UGRP)(응답).xlsx'  # Change this to the actual file path.
save_path = 'G:\\dataset'  # Change this to the actual output directory.

# Load the Excel file into a DataFrame
df = pd.read_excel(file_path)

# Pick the first non-NaN value among the columns whose name contains the keyword.
def find_first_non_empty_value(row, keyword):
    """Return the first non-missing value of ``row`` in a column matching ``keyword``.

    The same question can be spread over several spreadsheet columns, so the
    columns are scanned in order and the first filled-in one wins.

    Args:
        row: A ``pandas.Series`` holding one spreadsheet row, indexed by
            column name.
        keyword: Substring that a column name must contain to be considered.

    Returns:
        The first value for which ``pd.notna`` is true, or ``None`` when no
        column matches or every matching column is NaN.
    """
    # Scan the row's own labels rather than the module-level ``df`` so the
    # helper works on any row. Non-string labels cannot contain the keyword
    # and would make the ``in`` test raise TypeError, so they are skipped.
    for col in row.index:
        if isinstance(col, str) and keyword in col and pd.notna(row[col]):
            return row[col]

    return None

# Turn a Google Drive share link into a direct-download link.
def convert_drive_link(drive_url):
    """Return a direct-download URL for a Google Drive share link.

    Recognised link shapes:
      * ``...?id=<file_id>`` (optionally followed by other query parameters)
      * ``.../d/<file_id>/...`` (e.g. ``/file/d/<file_id>/view?usp=sharing``)

    If the text contains several ``id=`` parameters, the last one is used,
    which matches what the previous ``split('=')[-1]`` logic produced.
    Anything else falls back to the text after the last ``=``.

    Args:
        drive_url: The share link as a string.

    Returns:
        ``https://drive.google.com/uc?export=download&id=<file_id>``.
    """
    id_params = re.findall(r'[?&]id=([A-Za-z0-9_-]+)', drive_url)
    if id_params:
        file_id = id_params[-1]
    else:
        path_match = re.search(r'/d/([A-Za-z0-9_-]+)', drive_url)
        # Fall back to the original behaviour for unrecognised link shapes.
        file_id = path_match.group(1) if path_match else drive_url.split('=')[-1]
    return f"https://drive.google.com/uc?export=download&id={file_id}"

# Download an image (or a PDF) and save it as a JPEG.
def download_image(url, path):
    """Download the file at ``url`` and save it to ``path`` as a JPEG.

    Google Drive links are first rewritten with ``convert_drive_link``.
    A response starting with ``%PDF`` is treated as a PDF and its first page
    is rendered; anything else is opened as an image. Every failure is
    printed instead of raised, so one bad link does not abort the whole run.

    Args:
        url: Address of the image or PDF.
        path: Destination file path; its parent folder is created if needed.

    Returns:
        None.
    """
    try:
        # Rewrite Google Drive share links into direct-download links
        if 'drive.google.com' in url:
            url = convert_drive_link(url)

        # A timeout keeps one unresponsive host from hanging the whole run.
        response = requests.get(url, timeout=30)

        # Check the response status code
        if response.status_code != 200:
            print(f"Failed to download {url}: Status code {response.status_code}")
            return

        # Make sure the response body is not empty
        if not response.content:
            print(f"Failed to download {url}: Empty response")
            return

        # Sniff the leading bytes to tell a PDF from an image
        if response.content.startswith(b'%PDF'):
            # convert_from_path needs a file on disk. delete=False lets the
            # file be reopened by name after this handle is closed.
            with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as temp_pdf:
                temp_pdf.write(response.content)
                temp_pdf_path = temp_pdf.name

            try:
                # Only the first page is used, so render only that page.
                images = convert_from_path(temp_pdf_path, first_page=1, last_page=1)
            finally:
                # Remove the temp file even when the conversion fails.
                os.remove(temp_pdf_path)
            image = images[0]
        else:
            image = Image.open(BytesIO(response.content))

        # JPEG has no alpha channel or palette, so convert any mode the JPEG
        # writer cannot store (RGBA, LA, P, ...) to RGB before saving.
        if image.mode not in ('1', 'L', 'RGB', 'RGBX', 'CMYK', 'YCbCr'):
            image = image.convert('RGB')

        # Create the destination folder; exist_ok avoids the exists/makedirs race.
        folder_path = os.path.dirname(path)
        if folder_path:
            os.makedirs(folder_path, exist_ok=True)

        image.save(path, 'JPEG')  # Always write JPEG data
        print(f"Image saved to {path}")

    except Exception as e:
        # Deliberate best-effort boundary: log the failure and carry on.
        print(f"Failed to download {url}: {e}")

# Collect the needed fields into a new frame
df_selected = pd.DataFrame()

# Output field -> keyword identifying the survey column(s) that hold it.
# Each field takes the first non-NaN value among its matching columns.
field_keywords = {
    'name': '참여자 성명',
    'emotion': '가까운 감정',
    'image': '업로드 해주세요',
    'intention': '그림의 의도를 적어주세요',
    'weight1': '위 예시 사진',
    'weight2': '자신이 그린 그림',
}
for field, keyword in field_keywords.items():
    df_selected[field] = df.apply(find_first_non_empty_value, axis=1, args=(keyword,))

# Strip the parenthesised description from the emotion label
df_selected['emotion'] = df_selected['emotion'].apply(lambda x: re.sub(r'\([^)]*\)', '', str(x)).strip() if pd.notnull(x) else x)

# Root folder for the images; per-emotion subfolders are created below
image_folder = os.path.join(save_path, 'image')
os.makedirs(image_folder, exist_ok=True)

# Build the JSON records
json_data = []

for _, row in df_selected.iterrows():
    image_id = row['name']
    emotion = row['emotion']
    image_url = row['image']

    # A row without an emotion label cannot be filed under a folder;
    # os.path.join would raise TypeError on None/NaN, so skip it.
    if pd.isnull(emotion):
        print(f"Skipping row for {image_id}: no emotion label")
        continue

    # Per-emotion subfolder
    emotion_folder = os.path.join(image_folder, emotion)
    os.makedirs(emotion_folder, exist_ok=True)

    # Image is stored as <emotion>/<image_id>.jpg
    image_path = os.path.join(emotion_folder, f"{image_id}.jpg")

    # Download and save the image when a link was provided
    if pd.notnull(image_url):
        download_image(image_url, image_path)

    # Record the entry; the path is relative to save_path
    json_data.append({
        'image': f"image/{emotion}/{image_id}.jpg",
        'label': emotion,
        'image_id': image_id,
        'weight1': row['weight1'],
        'weight2': row['weight2'],
        'intention': row['intention']
    })

# Write the JSON file
json_file_path = os.path.join(save_path, 'dataset_info.json')
with open(json_file_path, 'w', encoding='utf-8') as f:
    json.dump(json_data, f, ensure_ascii=False, indent=4)

print("작업이 완료되었습니다.")