"""
뉴스 기사의 핵심 명사를 추출해 단어 빈도 막대그래프와 
워드클라우드를 생성하는 예시 코드입니다.
- 필요 패키지: konlpy, matplotlib, seaborn, pandas, wordcloud
"""

from pathlib import Path
import re
from collections import Counter

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from matplotlib import font_manager
from konlpy.tag import Okt
from wordcloud import WordCloud

# Read the whole article text from news.txt (UTF-8) into memory.
raw_text = Path("news.txt").read_text(encoding="utf-8")


def normalize(text: str) -> str:
    """Return *text* reduced to Hangul syllables, digits and single spaces.

    Every character that is not a Hangul syllable (가-힣), an ASCII digit or
    whitespace is replaced by a space; runs of whitespace are then collapsed
    to one space and the ends are trimmed.
    """
    hangul_and_digits = re.sub(r"[^가-힣0-9\s]", " ", text)
    return re.sub(r"\s+", " ", hangul_and_digits).strip()


# Words irrelevant to the analysis; add, remove or edit entries freely.
custom_stopwords = {"위해", "위한", "이번", "관련", "수", "등", "및", "통해"}

clean_text = normalize(raw_text)
okt = Okt()

# Extract nouns with the Okt morphological analyzer; single-character tokens
# are dropped because they carry little meaning on their own.
nouns = [noun for noun in okt.nouns(clean_text) if len(noun) > 1]

# Remove every noun that appears in the stopword set.
filtered = [noun for noun in nouns if noun not in custom_stopwords]

# Count occurrences and keep the 20 most frequent words for plotting.
freqs = Counter(filtered)
top_words = freqs.most_common(20)

# Candidate font files able to render Hangul, probed in order; the first one
# that exists on disk is used. Extend this list for other environments.
FONT_PATH = [
    Path("C:/Windows/Fonts/malgun.ttf"),  # Windows system font
    Path("/System/Library/Fonts/AppleGothic.ttf"),  # macOS system font
    # Newer macOS releases keep AppleGothic under the Supplemental folder.
    Path("/System/Library/Fonts/Supplemental/AppleGothic.ttf"),
    Path("/usr/share/fonts/truetype/nanum/NanumGothic.ttf"),  # Linux system font
]

font_path = next((candidate for candidate in FONT_PATH if candidate.exists()), None)
if font_path is None:
    # The message names FONT_PATH because that list is what the user must edit.
    raise FileNotFoundError("한글 렌더링을 위해 사용 가능한 폰트 경로를 FONT_PATH에 추가하세요.")

# Register the file explicitly so matplotlib can resolve the family name even
# when the font is missing from its cached font list (e.g. installed after the
# cache was built).
font_manager.fontManager.addfont(str(font_path))
font_name = font_manager.FontProperties(fname=str(font_path)).get_name()

# --- Frequency bar chart ---
sns.set_theme(style="whitegrid")
# Apply the Hangul-capable font after set_theme(), which rewrites rcParams
# (the font family among them).
plt.rcParams["font.family"] = font_name
freq_df = pd.DataFrame(top_words, columns=["word", "count"])
# most_common() already sorted by count; keep that order on the x axis.
order = freq_df["word"].tolist()

# A bar chart makes it easy to compare words and their counts at a glance.
plt.figure(figsize=(12, 6))
# Passing `palette` without `hue` is deprecated in seaborn 0.13, so the x
# variable is also mapped to hue. dodge=False keeps one full-width bar per
# word on older seaborn versions that would otherwise dodge by hue.
ax = sns.barplot(
    data=freq_df,
    x="word",
    y="count",
    hue="word",
    order=order,
    hue_order=order,
    dodge=False,
    palette="viridis",
)
# The hue legend would only repeat the x tick labels; drop it if one was drawn.
legend = ax.get_legend()
if legend is not None:
    legend.remove()
plt.title("상위 명사 빈도 (news.txt)")
plt.xlabel("단어")
plt.ylabel("빈도")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()

# --- Word cloud ---
# Render the same frequency table as a word cloud for visual emphasis.
cloud_options = {
    "font_path": str(font_path),
    "background_color": "white",
    "width": 800,
    "height": 400,
}
wc = WordCloud(**cloud_options).generate_from_frequencies(freqs)

cloud_fig, cloud_ax = plt.subplots(figsize=(10, 5))
# Bilinear interpolation softens the jagged pixel edges of the rendered image.
cloud_ax.imshow(wc, interpolation="bilinear")
cloud_ax.axis("off")
cloud_ax.set_title("Word Cloud - news.txt")
cloud_fig.tight_layout()
plt.show()