Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/temp/update-ky-youtube.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,10 @@ jobs:
ref: feat/songUpdate
persist-credentials: false # μˆ˜λ™ 인증으둜 ν‘Έμ‹œ μ œμ–΄

- name: Use Node.js 18
- name: Use Node.js 20
uses: actions/setup-node@v4
with:
node-version: "18"
node-version: "20"

- name: Install pnpm
uses: pnpm/action-setup@v2
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/crawl-recent-tj.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,10 @@ jobs:
steps:
- uses: actions/checkout@v4

- name: Use Node.js 18
- name: Use Node.js 20
uses: actions/setup-node@v4
with:
node-version: "18"
node-version: "20"

- name: Install pnpm
uses: pnpm/action-setup@v2
Expand Down
43 changes: 43 additions & 0 deletions .github/workflows/update-ky-youtube.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
name: Update ky by Youtube

# NOTE(review): the original comment here read "execution temporarily paused", but the
# schedule trigger below is active — confirm which of the two is intended.
on:
  schedule:
    - cron: "0 14 * * *" # Runs at 23:00 Korea time (UTC+9 -> 14:00 UTC)
  workflow_dispatch:

permissions:
  contents: write # Required for push permission

jobs:
  run-npm-task:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout branch
        uses: actions/checkout@v4

      - name: Use Node.js 20
        uses: actions/setup-node@v4
        with:
          node-version: "20"

      - name: Install pnpm
        uses: pnpm/action-setup@v2
        with:
          version: 9
          run_install: false

      - name: Install dependencies
        working-directory: packages/crawling
        run: pnpm install

      # The crawler reads its Supabase credentials from packages/crawling/.env.
      - name: Create .env file
        working-directory: packages/crawling
        run: |
          echo "SUPABASE_URL=${{ secrets.SUPABASE_URL }}" >> .env
          echo "SUPABASE_KEY=${{ secrets.SUPABASE_KEY }}" >> .env

      - name: run update script - packages/crawling/crawlYoutube.ts
        working-directory: packages/crawling
        run: pnpm run ky-youtube
8 changes: 4 additions & 4 deletions apps/web/src/auth.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,10 @@ export default function AuthProvider({ children }: { children: React.ReactNode }
return;
}

// if (isPublicPath) {
// setIsAuthChecked(true);
// return;
// }
if (isPublicPath) {
setIsAuthChecked(true);
return;
}

// 이미 인증된 μƒνƒœλ©΄ λ°”λ‘œ 톡과 (ν•˜μ§€λ§Œ μ²΄ν¬λŠ” μˆ˜ν–‰)
const handleAuth = async () => {
Expand Down
210 changes: 130 additions & 80 deletions packages/crawling/src/crawling/crawlYoutube.ts
Original file line number Diff line number Diff line change
@@ -1,109 +1,159 @@
import * as cheerio from 'cheerio';
import puppeteer from 'puppeteer';
import puppeteer, { Browser, Page } from 'puppeteer';

import { getInvalidKYSongsDB, getSongsKyNullDB } from '@/supabase/getDB';
import { postInvalidKYSongsDB } from '@/supabase/postDB';
import { updateSongsKyDB } from '@/supabase/updateDB';
import { Song } from '@/types';
import { saveCrawlYoutubeFailedKYSongs, updateDataLog } from '@/utils/logData';

import { isValidKYExistNumber } from './isValidKYExistNumber';

// Crawls KY karaoke numbers from YouTube.
// Also includes the real-site validation that crawlYoutubeValid performs.

// --- Constants ---

/** Search endpoint of the KY karaoke YouTube channel; the query is appended as `?query=`. */
const BASE_YOUTUBE_SEARCH_URL = 'https://www.youtube.com/@KARAOKEKY/search';

// --- Helper Functions ---
/**
 * Pulls the KY karaoke number out of a piece of text (a video title).
 *
 * The number is recognised as `KY.` followed by optional whitespace, 2 to 5 digits
 * and a closing parenthesis.
 *
 * @param title - Text to inspect.
 * @returns The digits as a string, or `null` when the text contains no such marker.
 */
const extractKaraokeNumber = (title: string): string | null => {
  const captured = /KY\.\s*(\d{2,5})\)/.exec(title);
  if (captured === null) {
    return null;
  }
  return captured[1];
};

/**
 * Scrapes the KY karaoke number for a song from the channel's YouTube search results.
 *
 * Navigates `page` to the channel search URL for `query`, takes the first
 * `ytd-video-renderer` element of the rendered HTML and extracts the number from the
 * text of its first `yt-formatted-string`.
 *
 * @param page - Puppeteer page used for navigation (it is navigated away from its current URL).
 * @param query - Search text; URL-encoded before being placed in the query string.
 * @returns The extracted number, or `null` when there is no video result, the title
 *          carries no KY number, or navigation/parsing throws.
 */
const scrapeSongNumber = async (page: Page, query: string): Promise<string | null> => {
  const searchUrl = `${BASE_YOUTUBE_SEARCH_URL}?query=${encodeURIComponent(query)}`;

  try {
    // Wait until the network has settled so the results are rendered. No explicit
    // timeout is passed, so the page's default navigation timeout applies instead of
    // the unbounded `timeout: 0` used previously.
    await page.goto(searchUrl, {
      waitUntil: 'networkidle2',
    });

    const html = await page.content();
    const $ = cheerio.load(html);

    const firstItem = $('ytd-video-renderer').first();

    // No search results at all.
    if (firstItem.length === 0) {
      return null;
    }

    const title = firstItem.find('yt-formatted-string').first().text().trim();
    return extractKaraokeNumber(title);
  } catch (error) {
    // NOTE(review): errors are reported as `null`, so callers cannot tell a transient
    // navigation failure apart from "no result" — confirm that is acceptable.
    console.warn(`[Scraping Failed] Query: ${query}`, error);
    return null;
  }
};

/**
 * Stores a verified KY number for a song by calling `updateSongsKyDB` with the song
 * and its `num_ky` set to `kyNum`.
 *
 * Logging of the update result (console output and the `crawlYoutubeSuccess.txt`
 * data log) is currently disabled to keep the output small.
 *
 * @param song - Song record to update.
 * @param kyNum - KY karaoke number to store on the song.
 */
const handleSuccess = async (song: Song, kyNum: string): Promise<void> => {
  // NOTE(review): the value resolved by updateSongsKyDB is not inspected here, so a
  // failed update would not be noticed by the caller — confirm this is intended.
  await updateSongsKyDB({ ...song, num_ky: kyNum });
};

/**
 * Records a song for which no usable KY number was found by passing it to
 * `postInvalidKYSongsDB`.
 *
 * Writing to the `crawlYoutubeFailed.txt` data log is currently disabled.
 *
 * @param song - Song record to store as invalid.
 */
const handleFailure = async (song: Song): Promise<void> => {
  await postInvalidKYSongsDB(song);
};

// --- Main Logic ---

// Open idea carried over from the earlier implementation: instead of loading failedSongs
// and skipping them, a failure could bump the song's update_date so that it is simply
// pushed back in the work order and retried later.

/**
 * Runs the whole crawl: launches a browser, loads the songs whose KY number is missing,
 * looks each one up on YouTube, validates the number found and stores the outcome.
 *
 * Songs already present in the invalid-songs list are skipped. Errors raised after the
 * browser has been launched are logged and mark the process as failed; the browser is
 * closed in every case.
 */
const main = async () => {
  console.log('🚀 크롤링 작업을 시작합니다...');

  // 1. Launch the browser. The sandbox flags are there for Linux environment compatibility.
  const browser = await puppeteer.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });

  try {
    const page = await browser.newPage();

    // 2. Load both lists in parallel to shorten the wait.
    const [targetSongs, failedSongs] = await Promise.all([
      getSongsKyNullDB(),
      getInvalidKYSongsDB(),
    ]);

    // 3. Keep the failed ids in a Set so the per-song skip check is a constant-time lookup.
    const failedSongIds = new Set(failedSongs.map(s => s.id));

    // Count the songs that will actually be attempted. Subtracting the two list lengths
    // is wrong whenever the failed list contains ids that are not among the targets.
    const pendingCount = targetSongs.filter(s => !failedSongIds.has(s.id)).length;

    console.log(`📊 ky가 null인 대상 곡: ${targetSongs.length}개`);
    console.log(`🚫 이미 실패한 곡(유효하지 않은 KY 노래방 번호): ${failedSongs.length}개`);
    console.log(`🎯 추가 가능한 최대 곡 개수: ${pendingCount}개`);

    let processedCount = 0;
    let successCount = 0;

    // 4. Process the songs one at a time; they all share the single page.
    for (const song of targetSongs) {
      processedCount++;
      const query = `${song.title}-${song.artist}`;

      // 4-1. Skip songs that already failed before.
      if (failedSongIds.has(song.id)) {
        continue;
      }

      console.log(`[${processedCount}/${targetSongs.length}] 검색 중: ${query}`);

      // 4-2. Try to scrape the number.
      const resultKyNum = await scrapeSongNumber(page, query);

      if (!resultKyNum) {
        // NOTE(review): scrapeSongNumber also returns null when navigation throws, so a
        // transient failure is recorded as invalid too — confirm this is intended.
        console.log(`❌ 검색 결과 없음: ${query}`);
        await handleFailure(song);
        continue;
      }

      // 4-3. Second check: validate the number that was found.
      let isValid = false;
      try {
        isValid = await isValidKYExistNumber(page, resultKyNum, song.title, song.artist);
      } catch (error) {
        // A validation error is not recorded as a failure; the song is left untouched.
        console.error(`❌ 검증 중 에러 발생: ${query}`, error);
        continue;
      }

      if (isValid) {
        await handleSuccess(song, resultKyNum);
        successCount++;
        console.log(`✅ 업데이트 완료: ${resultKyNum}`);
      } else {
        await handleFailure(song);
        console.log(`⚠️ 유효하지 않은 번호: ${resultKyNum}`);
      }
    }

    console.log('------------------------------------------------');
    console.log(`🎉 모든 작업 완료! 총 성공: ${successCount}건`);
  } catch (error) {
    console.error('🔥 치명적인 에러 발생:', error);
    // Report the failure through the exit code so a scheduled CI run does not look green.
    process.exitCode = 1;
  } finally {
    // 5. Always close the browser, whether or not an error occurred.
    await browser.close();
    console.log('🔒 브라우저 종료됨');
  }
};

// Run the script. main() handles errors itself once the browser is up; this handler
// covers a failed browser launch or close so the rejection is not left unhandled.
main().catch((error: unknown) => {
  console.error('🔥 치명적인 에러 발생:', error);
  process.exitCode = 1;
});
Loading