import * as cheerio from "cheerio";
import axios from "axios";
import cron, { ScheduledTask } from "node-cron";
import { query } from "../database/db";
import { logger } from "../utils/logger";

/** One row of `crawl_configs`: what to fetch, how to parse it and where to store it. */
export interface CrawlConfig {
  id: string;
  company_code: string;
  name: string;
  url: string;
  method: string;
  headers: Record<string, string>;
  request_body?: string;
  selector_type: string;
  row_selector: string;
  column_mappings: Array<{
    selector: string;
    column: string;
    type: "text" | "number" | "date";
    attribute?: string; // read this attribute (href, src, ...) instead of the element text
  }>;
  target_table: string;
  upsert_key?: string;
  cron_schedule?: string;
  is_active: string;
  writer?: string;
}

/** Outcome of a single crawl execution. */
export interface CrawlResult {
  collected: number;
  saved: number;
  errors: string[];
}

/** Request headers sent with every crawl; per-config headers are merged on top of these. */
const DEFAULT_HEADERS = {
  "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
  Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  "Accept-Language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7",
};

/**
 * Minimal view of a cheerio selection used by row extraction: given a CSS
 * selector, return something that exposes `attr` and `text`.
 */
type ElementFinder = (selector: string) => {
  attr(name: string): string | undefined;
  text(): string;
};

/**
 * Crawl configuration CRUD, cron scheduling and crawl execution.
 *
 * All members are static; scheduled tasks are tracked in a map keyed by
 * config id for the lifetime of the process.
 */
export class CrawlService {
  private static scheduledTasks: Map<string, ScheduledTask> = new Map();

  // ─── Scheduler ───

  /**
   * Loads every active config that has a non-empty cron expression and
   * schedules it. Failures are logged, never thrown.
   */
  static async initializeScheduler() {
    try {
      const configs = await query(
        `SELECT * FROM crawl_configs WHERE is_active = 'Y' AND cron_schedule IS NOT NULL AND cron_schedule != ''`
      );
      logger.info(`크롤링 스케줄러: ${configs.length}개 설정 등록`);
      for (const config of configs) {
        this.scheduleConfig(config);
      }
    } catch (error) {
      logger.error("크롤링 스케줄러 초기화 실패:", error);
    }
  }

  /**
   * Registers (or replaces) the cron task for `config`.
   *
   * An invalid or missing cron expression is logged and ignored; in that case
   * an already-registered task for the same id is left untouched.
   */
  static scheduleConfig(config: CrawlConfig) {
    if (!config.cron_schedule || !cron.validate(config.cron_schedule)) {
      logger.warn(`크롤링 [${config.name}]: 유효하지 않은 cron 표현식 - ${config.cron_schedule}`);
      return;
    }

    // Drop any previous schedule for this config before registering the new one.
    this.unscheduleConfig(config.id);

    const task = cron.schedule(
      config.cron_schedule,
      async () => {
        logger.info(`크롤링 [${config.name}] 스케줄 실행 시작`);
        try {
          await this.executeCrawl(config);
        } catch (error) {
          // executeCrawl can still reject (e.g. the execution log row cannot be
          // created); without this catch that becomes an unhandled rejection.
          logger.error(`크롤링 [${config.name}] 스케줄 실행 실패:`, error);
        }
      },
      { timezone: "Asia/Seoul" }
    );

    this.scheduledTasks.set(config.id, task);
    logger.info(`크롤링 [${config.name}] 스케줄 등록: ${config.cron_schedule}`);
  }

  /** Stops and forgets the cron task registered for `configId`, if any. */
  static unscheduleConfig(configId: string) {
    const task = this.scheduledTasks.get(configId);
    if (task) {
      task.stop();
      this.scheduledTasks.delete(configId);
    }
  }

  // ─── CRUD ───

  /**
   * Lists configs, newest first. A company code of `"*"` returns the configs
   * of every company.
   */
  static async getConfigs(companyCode: string) {
    const condition = companyCode === "*" ? "" : "WHERE company_code = $1";
    const params = companyCode === "*" ? [] : [companyCode];
    return query(`SELECT * FROM crawl_configs ${condition} ORDER BY created_date DESC`, params);
  }

  /** Returns the config with the given id, or `null` when it does not exist. */
  static async getConfigById(id: string) {
    const rows = await query(`SELECT * FROM crawl_configs WHERE id = $1`, [id]);
    return rows[0] || null;
  }

  /**
   * Inserts a new config and schedules it when it is active and has a cron
   * expression. Returns the inserted row.
   */
  static async createConfig(data: Partial<CrawlConfig>) {
    const result = await query(
      `INSERT INTO crawl_configs
        (company_code, name, url, method, headers, request_body, selector_type,
         row_selector, column_mappings, target_table, upsert_key, cron_schedule, is_active, writer)
       VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14)
       RETURNING *`,
      [
        data.company_code,
        data.name,
        data.url,
        data.method || "GET",
        JSON.stringify(data.headers || {}),
        data.request_body || null,
        data.selector_type || "css",
        data.row_selector || null,
        JSON.stringify(data.column_mappings || []),
        data.target_table,
        data.upsert_key || null,
        data.cron_schedule || null,
        data.is_active || "Y",
        data.writer || null,
      ]
    );

    const config = result[0];
    if (config.is_active === "Y" && config.cron_schedule) {
      this.scheduleConfig(config);
    }
    return config;
  }

  /**
   * Updates a config and re-syncs its schedule. Returns the updated row, or
   * `undefined` when no row has that id.
   *
   * NOTE: `request_body`, `row_selector`, `upsert_key` and `cron_schedule` are
   * written unconditionally, so omitting them from `data` sets them to NULL;
   * the remaining columns keep their current value when omitted (COALESCE).
   */
  static async updateConfig(id: string, data: Partial<CrawlConfig>) {
    const result = await query(
      `UPDATE crawl_configs SET
        name = COALESCE($2, name),
        url = COALESCE($3, url),
        method = COALESCE($4, method),
        headers = COALESCE($5, headers),
        request_body = $6,
        selector_type = COALESCE($7, selector_type),
        row_selector = $8,
        column_mappings = COALESCE($9, column_mappings),
        target_table = COALESCE($10, target_table),
        upsert_key = $11,
        cron_schedule = $12,
        is_active = COALESCE($13, is_active),
        updated_date = now()
       WHERE id = $1
       RETURNING *`,
      [
        id,
        data.name,
        data.url,
        data.method,
        data.headers ? JSON.stringify(data.headers) : null,
        data.request_body ?? null,
        data.selector_type,
        data.row_selector ?? null,
        data.column_mappings ? JSON.stringify(data.column_mappings) : null,
        data.target_table,
        data.upsert_key ?? null,
        data.cron_schedule ?? null,
        data.is_active,
      ]
    );

    const config = result[0];
    if (config) {
      this.unscheduleConfig(id);
      if (config.is_active === "Y" && config.cron_schedule) {
        this.scheduleConfig(config);
      }
    }
    return config;
  }

  /** Unschedules the config and deletes its row. */
  static async deleteConfig(id: string) {
    this.unscheduleConfig(id);
    await query(`DELETE FROM crawl_configs WHERE id = $1`, [id]);
  }

  // ─── Crawl execution ───

  /**
   * Runs one crawl: fetches the page, extracts rows, stores them in the target
   * table and records the outcome in `crawl_execution_logs` and `crawl_configs`.
   *
   * Fetch/parse failures are captured in the returned `errors` rather than
   * thrown. Individual row-save failures do not abort the run. The call still
   * rejects if the initial execution-log row cannot be created.
   */
  static async executeCrawl(config: CrawlConfig): Promise<CrawlResult> {
    const logId = await this.createLog(config.id, config.company_code);
    const errors: string[] = [];
    let collected = 0;
    let saved = 0;

    try {
      // 1. HTTP request. Headers may arrive as a JSON string or as an object.
      const customHeaders =
        typeof config.headers === "string" ? JSON.parse(config.headers) : config.headers || {};
      const response = await axios({
        method: (config.method || "GET") as any,
        url: config.url,
        headers: { ...DEFAULT_HEADERS, ...customHeaders },
        data: config.request_body || undefined,
        timeout: 30000,
        responseType: "text",
      });

      const html = response.data;
      const htmlPreview = typeof html === "string" ? html.substring(0, 2000) : "";

      // 2. DOM parsing. Mappings may also arrive as a JSON string.
      const $ = cheerio.load(html);
      const mappings: CrawlConfig["column_mappings"] =
        typeof config.column_mappings === "string"
          ? JSON.parse(config.column_mappings)
          : config.column_mappings || [];

      // 3. Row extraction.
      const rows: Record<string, any>[] = [];
      if (config.row_selector) {
        $(config.row_selector).each((_, el) => {
          rows.push(this.extractRow((selector) => $(el).find(selector), mappings));
        });
      } else {
        // Without a row selector the mappings address the document directly (single row).
        rows.push(this.extractRow((selector) => $(selector), mappings));
      }
      collected = rows.length;

      // 4. Persist rows one by one so a bad row does not abort the others.
      for (const row of rows) {
        try {
          row.company_code = config.company_code;
          if (config.upsert_key) {
            await this.upsertRow(config.target_table, row, config.upsert_key, config.company_code);
          } else {
            await this.insertRow(config.target_table, row);
          }
          saved++;
        } catch (err: unknown) {
          errors.push(`행 저장 실패: ${this.describeError(err)}`);
        }
      }
      if (errors.length > 0) {
        // Scheduled runs discard the returned result, so surface row failures in the log.
        logger.warn(`크롤링 [${config.name}] 행 저장 실패 ${errors.length}건: ${errors[0]}`);
      }

      // 5. Status update.
      await this.updateLog(logId, "success", collected, saved, null, htmlPreview);
      await query(
        `UPDATE crawl_configs SET last_executed_at = now(), last_status = 'success', last_error = null WHERE id = $1`,
        [config.id]
      );
      logger.info(`크롤링 [${config.name}] 완료: ${collected}건 수집, ${saved}건 저장`);
    } catch (error: unknown) {
      const errMsg = this.describeError(error);
      errors.push(errMsg);
      logger.error(`크롤링 [${config.name}] 실패:`, error);

      // Recording the failure is best-effort: if the database itself is the
      // problem, keep the original error and still return a result.
      try {
        await this.updateLog(logId, "fail", collected, saved, errMsg, null);
        await query(
          `UPDATE crawl_configs SET last_executed_at = now(), last_status = 'fail', last_error = $2 WHERE id = $1`,
          [config.id, errMsg]
        );
      } catch (bookkeepingError) {
        logger.error(`크롤링 [${config.name}] 실패 상태 기록 실패:`, bookkeepingError);
      }
    }

    return { collected, saved, errors };
  }

  // ─── URL auto-analysis ───

  /**
   * Fetches `url` and describes every `<table>` found: a suggested selector,
   * caption, header texts, data-row count and up to three sample rows.
   * Rejects when the HTTP request fails.
   */
  static async analyzeUrl(url: string) {
    const response = await axios({
      method: "GET",
      url,
      headers: DEFAULT_HEADERS,
      timeout: 15000,
      responseType: "text",
    });

    const $ = cheerio.load(response.data);
    const tables: Array<{
      index: number;
      selector: string;
      caption: string;
      headers: string[];
      rowCount: number;
      sampleRows: string[][];
    }> = [];

    $("table").each((i, tableEl) => {
      const $table = $(tableEl);

      // Header cells: <thead> cells or <th> cells of the first row.
      const headers: string[] = [];
      $table.find("thead th, thead td, tr:first-child th").each((_, th) => {
        headers.push($(th).text().trim());
      });
      // No header cells found: fall back to the <td> cells of the first row.
      if (headers.length === 0) {
        $table.find("tr:first-child td").each((_, td) => {
          headers.push($(td).text().trim());
        });
      }

      // Data rows: <tbody> rows if present, otherwise all rows minus the header row.
      const bodyRows = $table.find("tbody tr");
      const allRows =
        bodyRows.length > 0 ? bodyRows : $table.find("tr").slice(headers.length > 0 ? 1 : 0);
      const rowCount = allRows.length;

      // Sample of at most three rows.
      const sampleRows: string[][] = [];
      allRows.slice(0, 3).each((_, tr) => {
        const cells: string[] = [];
        $(tr)
          .find("td, th")
          .each((__, cell) => {
            cells.push($(cell).text().trim());
          });
        sampleRows.push(cells);
      });

      if (headers.length === 0 && rowCount === 0) {
        return;
      }

      // Suggested selector: id, then first class name, then position.
      const id = $table.attr("id");
      // Trim first so a class attribute with leading/only whitespace cannot
      // yield an empty token (which would produce the invalid selector "table.").
      const firstClass = $table.attr("class")?.trim().split(/\s+/)[0];
      let selector = "table";
      if (id) selector = `table#${id}`;
      else if (firstClass) selector = `table.${firstClass}`;
      else if (i > 0) selector = `table:nth-of-type(${i + 1})`;

      const caption = $table.find("caption").text().trim() || $table.attr("summary") || "";

      tables.push({ index: i, selector, caption, headers, rowCount, sampleRows });
    });

    return {
      title: $("title").text().trim(),
      tableCount: tables.length,
      tables,
      htmlLength: response.data.length,
    };
  }

  // ─── Preview ───

  /**
   * Fetches `url` and extracts up to 10 rows with the given selectors without
   * storing anything. With an empty `rowSelector` no rows are extracted.
   * Rejects when the HTTP request fails.
   */
  static async preview(
    url: string,
    rowSelector: string,
    columnMappings: CrawlConfig["column_mappings"],
    method = "GET",
    headers: Record<string, string> = {},
    requestBody?: string
  ) {
    const response = await axios({
      method: method as any,
      url,
      headers: { ...DEFAULT_HEADERS, ...headers },
      data: requestBody || undefined,
      timeout: 15000,
      responseType: "text",
    });

    const $ = cheerio.load(response.data);
    const rows: Record<string, any>[] = [];

    if (rowSelector) {
      $(rowSelector)
        .slice(0, 10) // preview is limited to 10 rows
        .each((_, el) => {
          rows.push(this.extractRow((selector) => $(el).find(selector), columnMappings));
        });
    }

    return {
      totalElements: rowSelector ? $(rowSelector).length : 0,
      previewRows: rows,
      htmlLength: response.data.length,
    };
  }

  // ─── Utilities ───

  /**
   * Builds one result row by resolving every mapping through `find`, reading
   * either the configured attribute or the trimmed element text, and casting
   * it to the mapping's type.
   */
  private static extractRow(
    find: ElementFinder,
    mappings: CrawlConfig["column_mappings"]
  ): Record<string, any> {
    const row: Record<string, any> = {};
    for (const mapping of mappings) {
      const $el = find(mapping.selector);
      const raw = mapping.attribute ? $el.attr(mapping.attribute) || "" : $el.text().trim();
      row[mapping.column] = this.castValue(raw, mapping.type);
    }
    return row;
  }

  /**
   * Converts raw text to the mapped type. Empty input becomes `null`;
   * `"number"` strips everything except digits, `.` and `-` and yields `null`
   * when the remainder is not numeric; other types are returned unchanged.
   */
  private static castValue(raw: string, type: string): any {
    if (!raw) return null;
    if (type === "number") {
      const num = parseFloat(raw.replace(/[^0-9.\-]/g, ""));
      return isNaN(num) ? null : num;
    }
    return raw;
  }

  /** Extracts a message from a caught value, falling back to "Unknown error". */
  private static describeError(error: unknown): string {
    const message = (error as { message?: unknown } | null | undefined)?.message;
    return typeof message === "string" && message ? message : "Unknown error";
  }

  /**
   * Wraps an SQL identifier in double quotes, doubling embedded quotes so a
   * config-supplied table or column name cannot terminate the quoted
   * identifier and inject SQL.
   */
  private static quoteIdent(identifier: string): string {
    return `"${String(identifier).replace(/"/g, '""')}"`;
  }

  /** Inserts `row` into `tableName`, one bound parameter per column. */
  private static async insertRow(tableName: string, row: Record<string, any>) {
    const cols = Object.keys(row);
    const vals = Object.values(row);
    const placeholders = cols.map((_, i) => `$${i + 1}`).join(", ");
    const colNames = cols.map((c) => this.quoteIdent(c)).join(", ");
    await query(
      `INSERT INTO ${this.quoteIdent(tableName)} (${colNames}) VALUES (${placeholders})`,
      vals
    );
  }

  /**
   * Updates the row matching (`upsertKey`, company_code) when one exists,
   * otherwise inserts `row`. The lookup and the write are separate statements.
   */
  private static async upsertRow(
    tableName: string,
    row: Record<string, any>,
    upsertKey: string,
    companyCode: string
  ) {
    const table = this.quoteIdent(tableName);
    const keyColumn = this.quoteIdent(upsertKey);

    const existing = await query(
      `SELECT 1 FROM ${table} WHERE ${keyColumn} = $1 AND company_code = $2 LIMIT 1`,
      [row[upsertKey], companyCode]
    );

    if (existing.length === 0) {
      await this.insertRow(tableName, row);
      return;
    }

    // Update every column except the lookup keys themselves.
    const setClauses: string[] = [];
    const vals: any[] = [];
    for (const [column, value] of Object.entries(row)) {
      if (column === upsertKey || column === "company_code") continue;
      vals.push(value);
      setClauses.push(`${this.quoteIdent(column)} = $${vals.length}`);
    }
    if (setClauses.length === 0) {
      return;
    }

    const keyIndex = vals.length + 1;
    vals.push(row[upsertKey], companyCode);
    await query(
      `UPDATE ${table} SET ${setClauses.join(", ")}, updated_date = now() WHERE ${keyColumn} = $${keyIndex} AND company_code = $${keyIndex + 1}`,
      vals
    );
  }

  /** Creates a `running` execution-log row and returns its id. */
  private static async createLog(configId: string, companyCode: string): Promise<string> {
    const result = await query(
      `INSERT INTO crawl_execution_logs (config_id, company_code, status) VALUES ($1, $2, 'running') RETURNING id`,
      [configId, companyCode]
    );
    return result[0].id;
  }

  /** Writes the final status, counters, error and HTML preview to an execution-log row. */
  private static async updateLog(
    logId: string,
    status: string,
    collected: number,
    saved: number,
    errorMessage: string | null,
    htmlPreview: string | null
  ) {
    await query(
      `UPDATE crawl_execution_logs SET status = $2, rows_collected = $3, rows_saved = $4, error_message = $5, response_html_preview = $6, finished_at = now() WHERE id = $1`,
      [logId, status, collected, saved, errorMessage, htmlPreview]
    );
  }

  // ─── Log lookup ───

  /** Returns the most recent execution logs of a config, newest first. */
  static async getLogs(configId: string, limit = 20) {
    return query(
      `SELECT * FROM crawl_execution_logs WHERE config_id = $1 ORDER BY started_at DESC LIMIT $2`,
      [configId, limit]
    );
  }
}