feat: add web crawling management functionality
- Introduced a new crawling management feature allowing users to configure, execute, and log web crawls.
- Added CRUD operations for crawl configurations, including URL analysis and preview capabilities.
- Implemented a new service for handling crawling logic and scheduling tasks.
- Integrated cheerio for HTML parsing and axios for HTTP requests.
- Created a sample HTML page for testing crawling functionality.

This commit enhances the application's data collection capabilities from external websites.
This commit is contained in:
@@ -115,6 +115,7 @@ import workHistoryRoutes from "./routes/workHistoryRoutes"; // 작업 이력 관
|
||||
import tableHistoryRoutes from "./routes/tableHistoryRoutes"; // 테이블 변경 이력 조회
|
||||
import bomRoutes from "./routes/bomRoutes"; // BOM 이력/버전 관리
|
||||
import productionRoutes from "./routes/productionRoutes"; // 생산계획 관리
|
||||
import crawlRoutes from "./routes/crawlRoutes"; // 웹 크롤링
|
||||
import roleRoutes from "./routes/roleRoutes"; // 권한 그룹 관리
|
||||
import departmentRoutes from "./routes/departmentRoutes"; // 부서 관리
|
||||
import tableCategoryValueRoutes from "./routes/tableCategoryValueRoutes"; // 카테고리 값 관리
|
||||
@@ -325,6 +326,7 @@ app.use("/api/work-history", workHistoryRoutes); // 작업 이력 관리
|
||||
app.use("/api/table-history", tableHistoryRoutes); // 테이블 변경 이력 조회
|
||||
app.use("/api/bom", bomRoutes); // BOM 이력/버전 관리
|
||||
app.use("/api/production", productionRoutes); // 생산계획 관리
|
||||
app.use("/api/crawl", crawlRoutes); // 웹 크롤링
|
||||
app.use("/api/material-status", materialStatusRoutes); // 자재현황
|
||||
app.use("/api/process-info", processInfoRoutes); // 공정정보관리
|
||||
app.use("/api/roles", roleRoutes); // 권한 그룹 관리
|
||||
@@ -415,6 +417,11 @@ async function initializeServices() {
|
||||
try {
|
||||
await BatchSchedulerService.initializeScheduler();
|
||||
logger.info(`⏰ 배치 스케줄러가 시작되었습니다.`);
|
||||
|
||||
// 크롤링 스케줄러 초기화
|
||||
const { CrawlService } = await import("./services/crawlService");
|
||||
await CrawlService.initializeScheduler();
|
||||
logger.info(`🕷️ 크롤링 스케줄러가 시작되었습니다.`);
|
||||
} catch (error) {
|
||||
logger.error(`❌ 배치 스케줄러 초기화 실패:`, error);
|
||||
}
|
||||
|
||||
124
backend-node/src/controllers/crawlController.ts
Normal file
124
backend-node/src/controllers/crawlController.ts
Normal file
@@ -0,0 +1,124 @@
|
||||
import { Request, Response } from "express";
|
||||
import { CrawlService } from "../services/crawlService";
|
||||
import { logger } from "../utils/logger";
|
||||
|
||||
/**
 * Express request as seen by the crawl handlers.
 *
 * `user` is optional at the type level; the handlers read `companyCode` to
 * scope data and `userId` to record the writer of a new configuration.
 */
interface AuthenticatedRequest extends Request {
  user?: {
    companyCode: string;
    userId: string;
  };
}
|
||||
|
||||
/**
 * GET /configs — lists crawl configurations.
 *
 * The caller's company code is forwarded to the service; when no company code
 * is available the wildcard "*" is sent instead, which the service treats as
 * "no company filter".
 *
 * Responds with `{ success: true, data }`, or HTTP 500 and the error message
 * when the lookup fails.
 */
export async function getCrawlConfigs(req: AuthenticatedRequest, res: Response) {
  const companyCode = req.user?.companyCode || "*";

  try {
    const data = await CrawlService.getConfigs(companyCode);
    return res.json({ success: true, data });
  } catch (error: any) {
    logger.error("크롤링 설정 조회 실패:", error);
    return res.status(500).json({ success: false, message: error.message });
  }
}
|
||||
|
||||
/**
 * GET /configs/:id — returns a single crawl configuration.
 *
 * The configuration is only returned when it belongs to the caller's company,
 * or when the caller's company code is the wildcard "*" (the same value the
 * list endpoint uses to disable company filtering). A configuration owned by
 * another company is reported as 404 so its existence is not disclosed.
 *
 * Responds with HTTP 404 when nothing visible matches and HTTP 500 on
 * unexpected failures.
 */
export async function getCrawlConfig(req: AuthenticatedRequest, res: Response) {
  try {
    const config = await CrawlService.getConfigById(req.params.id);
    const callerCompany = req.user?.companyCode;

    // Deny by default: a missing user never matches a stored company code.
    const isVisible =
      config !== null && (callerCompany === "*" || config.company_code === callerCompany);

    if (!config || !isVisible) {
      return res.status(404).json({ success: false, message: "설정을 찾을 수 없습니다." });
    }

    return res.json({ success: true, data: config });
  } catch (error: unknown) {
    logger.error("크롤링 설정 상세 조회 실패:", error);
    const message = error instanceof Error ? error.message : String(error);
    return res.status(500).json({ success: false, message });
  }
}
|
||||
|
||||
/**
 * POST /configs — creates a crawl configuration from the request body.
 *
 * `company_code` is taken from the authenticated user when available (falling
 * back to the body value) and `writer` is always the authenticated user's id,
 * so neither can be spoofed through the payload by a logged-in user.
 *
 * Responds with HTTP 400 when `url` is missing (the same rule the preview and
 * analyze endpoints enforce) and HTTP 500 on unexpected failures.
 */
export async function createCrawlConfig(req: AuthenticatedRequest, res: Response) {
  try {
    // A configuration without a URL can never be executed; reject it up front
    // instead of letting it reach the INSERT.
    if (!req.body?.url) {
      return res.status(400).json({ success: false, message: "URL은 필수입니다." });
    }

    const data = {
      ...req.body,
      company_code: req.user?.companyCode || req.body.company_code,
      writer: req.user?.userId,
    };

    const config = await CrawlService.createConfig(data);
    return res.json({ success: true, data: config });
  } catch (error: unknown) {
    logger.error("크롤링 설정 생성 실패:", error);
    const message = error instanceof Error ? error.message : String(error);
    return res.status(500).json({ success: false, message });
  }
}
|
||||
|
||||
/**
 * PUT /configs/:id — updates a crawl configuration.
 *
 * The stored configuration is loaded first and the update is only applied when
 * it belongs to the caller's company, or when the caller's company code is the
 * wildcard "*" (the value the list endpoint uses to disable company
 * filtering). Configurations owned by another company are reported as 404.
 *
 * Responds with HTTP 404 when nothing visible matches and HTTP 500 on
 * unexpected failures.
 */
export async function updateCrawlConfig(req: AuthenticatedRequest, res: Response) {
  try {
    const { id } = req.params;
    const callerCompany = req.user?.companyCode;

    // Ownership check before mutating: the id alone must not grant access.
    const existing = await CrawlService.getConfigById(id);
    const isOwned =
      existing !== null && (callerCompany === "*" || existing.company_code === callerCompany);
    if (!existing || !isOwned) {
      return res.status(404).json({ success: false, message: "설정을 찾을 수 없습니다." });
    }

    const config = await CrawlService.updateConfig(id, req.body);
    if (!config) {
      // The row disappeared between the ownership check and the update.
      return res.status(404).json({ success: false, message: "설정을 찾을 수 없습니다." });
    }

    return res.json({ success: true, data: config });
  } catch (error: unknown) {
    logger.error("크롤링 설정 수정 실패:", error);
    const message = error instanceof Error ? error.message : String(error);
    return res.status(500).json({ success: false, message });
  }
}
|
||||
|
||||
/**
 * DELETE /configs/:id — deletes a crawl configuration.
 *
 * The stored configuration is loaded first and only deleted when it belongs to
 * the caller's company, or when the caller's company code is the wildcard "*"
 * (the value the list endpoint uses to disable company filtering). Unknown ids
 * and configurations owned by another company both yield 404, matching the
 * other single-configuration endpoints.
 *
 * Responds with HTTP 500 on unexpected failures.
 */
export async function deleteCrawlConfig(req: AuthenticatedRequest, res: Response) {
  try {
    const { id } = req.params;
    const callerCompany = req.user?.companyCode;

    // Ownership check before deleting: the id alone must not grant access.
    const existing = await CrawlService.getConfigById(id);
    const isOwned =
      existing !== null && (callerCompany === "*" || existing.company_code === callerCompany);
    if (!existing || !isOwned) {
      return res.status(404).json({ success: false, message: "설정을 찾을 수 없습니다." });
    }

    await CrawlService.deleteConfig(id);
    return res.json({ success: true });
  } catch (error: unknown) {
    logger.error("크롤링 설정 삭제 실패:", error);
    const message = error instanceof Error ? error.message : String(error);
    return res.status(500).json({ success: false, message });
  }
}
|
||||
|
||||
/**
 * POST /preview — runs a crawl definition taken from the request body without
 * saving anything, and returns what the service extracted.
 *
 * `url` is mandatory (HTTP 400 otherwise); `column_mappings` defaults to an
 * empty list. Any failure while fetching or parsing is reported as HTTP 500
 * with the error message.
 */
export async function previewCrawl(req: AuthenticatedRequest, res: Response) {
  try {
    const body = req.body;

    if (!body.url) {
      return res.status(400).json({ success: false, message: "URL은 필수입니다." });
    }

    const data = await CrawlService.preview(
      body.url,
      body.row_selector,
      body.column_mappings || [],
      body.method,
      body.headers,
      body.request_body
    );

    return res.json({ success: true, data });
  } catch (error: any) {
    logger.error("크롤링 미리보기 실패:", error);
    return res.status(500).json({ success: false, message: error.message });
  }
}
|
||||
|
||||
/**
 * POST /analyze — asks the service to inspect the page at `url` and report the
 * table structures it detects.
 *
 * `url` is mandatory (HTTP 400 otherwise). Any failure while fetching or
 * parsing is reported as HTTP 500 with the error message.
 */
export async function analyzeUrl(req: AuthenticatedRequest, res: Response) {
  try {
    const targetUrl = req.body.url;

    if (!targetUrl) {
      return res.status(400).json({ success: false, message: "URL은 필수입니다." });
    }

    const data = await CrawlService.analyzeUrl(targetUrl);
    return res.json({ success: true, data });
  } catch (error: any) {
    logger.error("URL 분석 실패:", error);
    return res.status(500).json({ success: false, message: error.message });
  }
}
|
||||
|
||||
/**
 * POST /execute/:id — runs a stored crawl configuration immediately.
 *
 * The configuration is only executed when it belongs to the caller's company,
 * or when the caller's company code is the wildcard "*" (the value the list
 * endpoint uses to disable company filtering). Configurations owned by another
 * company are reported as 404, because running a crawl writes rows into that
 * configuration's target table.
 *
 * Responds with the service's crawl result, HTTP 404 when nothing visible
 * matches, and HTTP 500 on unexpected failures.
 */
export async function executeCrawl(req: AuthenticatedRequest, res: Response) {
  try {
    const config = await CrawlService.getConfigById(req.params.id);
    const callerCompany = req.user?.companyCode;

    // Deny by default: a missing user never matches a stored company code.
    const isOwned =
      config !== null && (callerCompany === "*" || config.company_code === callerCompany);
    if (!config || !isOwned) {
      return res.status(404).json({ success: false, message: "설정을 찾을 수 없습니다." });
    }

    const result = await CrawlService.executeCrawl(config);
    return res.json({ success: true, data: result });
  } catch (error: unknown) {
    logger.error("크롤링 수동 실행 실패:", error);
    const message = error instanceof Error ? error.message : String(error);
    return res.status(500).json({ success: false, message });
  }
}
|
||||
|
||||
/**
 * GET /configs/:id/logs — returns the most recent execution logs of a crawl
 * configuration.
 *
 * Query parameter `limit` selects how many entries to return. It defaults to
 * 20 when absent, non-numeric or not positive, and is capped at 1000 so a
 * single request cannot pull an unbounded number of log rows.
 *
 * Logs are only returned when the configuration belongs to the caller's
 * company, or when the caller's company code is the wildcard "*" (the value
 * the list endpoint uses to disable company filtering); otherwise the response
 * is 404.
 */
export async function getCrawlLogs(req: AuthenticatedRequest, res: Response) {
  try {
    const { id } = req.params;
    const callerCompany = req.user?.companyCode;

    const config = await CrawlService.getConfigById(id);
    const isOwned =
      config !== null && (callerCompany === "*" || config.company_code === callerCompany);
    if (!config || !isOwned) {
      return res.status(404).json({ success: false, message: "설정을 찾을 수 없습니다." });
    }

    // Explicit radix; negative or zero values would otherwise reach SQL LIMIT and fail there.
    const requested = Number.parseInt(String(req.query.limit ?? ""), 10);
    const limit = Number.isFinite(requested) && requested > 0 ? Math.min(requested, 1000) : 20;

    const logs = await CrawlService.getLogs(id, limit);
    return res.json({ success: true, data: logs });
  } catch (error: unknown) {
    logger.error("크롤링 로그 조회 실패:", error);
    const message = error instanceof Error ? error.message : String(error);
    return res.status(500).json({ success: false, message });
  }
}
|
||||
32
backend-node/src/routes/crawlRoutes.ts
Normal file
32
backend-node/src/routes/crawlRoutes.ts
Normal file
@@ -0,0 +1,32 @@
|
||||
import { Router } from "express";
|
||||
import { authenticateToken } from "../middleware/authMiddleware";
|
||||
import {
|
||||
getCrawlConfigs,
|
||||
getCrawlConfig,
|
||||
createCrawlConfig,
|
||||
updateCrawlConfig,
|
||||
deleteCrawlConfig,
|
||||
previewCrawl,
|
||||
analyzeUrl,
|
||||
executeCrawl,
|
||||
getCrawlLogs,
|
||||
} from "../controllers/crawlController";
|
||||
|
||||
// Router for the crawl endpoints. Every route runs `authenticateToken` before
// its handler.
const router = Router();

// Configuration CRUD
router
  .route("/configs")
  .get(authenticateToken, getCrawlConfigs)
  .post(authenticateToken, createCrawlConfig);

router
  .route("/configs/:id")
  .get(authenticateToken, getCrawlConfig)
  .put(authenticateToken, updateCrawlConfig)
  .delete(authenticateToken, deleteCrawlConfig);

// Page analysis, preview and manual execution
router.route("/analyze").post(authenticateToken, analyzeUrl);
router.route("/preview").post(authenticateToken, previewCrawl);
router.route("/execute/:id").post(authenticateToken, executeCrawl);

// Execution logs of one configuration
router.route("/configs/:id/logs").get(authenticateToken, getCrawlLogs);

export default router;
|
||||
489
backend-node/src/services/crawlService.ts
Normal file
489
backend-node/src/services/crawlService.ts
Normal file
@@ -0,0 +1,489 @@
|
||||
import * as cheerio from "cheerio";
|
||||
import axios from "axios";
|
||||
import cron, { ScheduledTask } from "node-cron";
|
||||
import { query } from "../database/db";
|
||||
import { logger } from "../utils/logger";
|
||||
|
||||
/** One extraction rule: which element to read and which column receives it. */
interface CrawlColumnMapping {
  /** CSS selector, evaluated inside each row (or on the whole page when there is no row selector). */
  selector: string;
  /** Name of the column the extracted value is stored under. */
  column: string;
  /** How the raw text is converted before saving. */
  type: "text" | "number" | "date";
  /** When set, read this attribute (e.g. href, src) instead of the element text. */
  attribute?: string;
}

/** A stored crawl definition, as read from / written to `crawl_configs`. */
export interface CrawlConfig {
  id: string;
  company_code: string;
  name: string;

  // HTTP request
  url: string;
  method: string;
  headers: Record<string, string>;
  request_body?: string;

  // Extraction
  selector_type: string;
  row_selector: string;
  column_mappings: CrawlColumnMapping[];

  // Persistence
  target_table: string;
  /** Column used to decide between update and insert; plain insert when absent. */
  upsert_key?: string;

  // Scheduling
  cron_schedule?: string;
  /** "Y" marks the configuration as active. */
  is_active: string;

  writer?: string;
}
|
||||
|
||||
/** Outcome of one crawl execution. */
export interface CrawlResult {
  /** Number of rows extracted from the page. */
  collected: number;
  /** Number of rows written to the target table. */
  saved: number;
  /** Messages for everything that went wrong during the run. */
  errors: string[];
}
|
||||
|
||||
// Request headers sent with every crawl request; per-configuration headers are
// merged on top of these and win on conflict.
const DEFAULT_HEADERS = {
  "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
  Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  "Accept-Language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7",
};
|
||||
|
||||
/**
 * Crawl management service.
 *
 * Responsibilities visible in this class:
 *  - CRUD for rows of `crawl_configs`;
 *  - cron scheduling of active configurations (in-process, via node-cron);
 *  - executing a crawl: fetch a page, extract rows with CSS selectors, write
 *    them into the configured target table and record an execution log;
 *  - ad-hoc page analysis and preview that do not persist anything.
 *
 * All members are static; scheduled tasks are kept in a process-local map.
 */
export class CrawlService {
  /** Active cron tasks keyed by crawl-configuration id. */
  private static scheduledTasks: Map<string, ScheduledTask> = new Map();

  // ─── Scheduler ───

  /**
   * Loads every active configuration that has a cron expression and schedules
   * it. Failures are logged and swallowed so that a scheduler problem does not
   * abort the caller's start-up sequence.
   */
  static async initializeScheduler() {
    try {
      const configs = await query<CrawlConfig>(
        `SELECT * FROM crawl_configs WHERE is_active = 'Y' AND cron_schedule IS NOT NULL AND cron_schedule != ''`
      );

      logger.info(`크롤링 스케줄러: ${configs.length}개 설정 등록`);

      for (const config of configs) {
        this.scheduleConfig(config);
      }
    } catch (error) {
      logger.error("크롤링 스케줄러 초기화 실패:", error);
    }
  }

  /**
   * Registers (or re-registers) the cron task of one configuration.
   *
   * An invalid or missing cron expression only produces a warning. Any task
   * previously registered under the same id is stopped first.
   *
   * @param config configuration to schedule; the task keeps using this
   *               snapshot until the configuration is scheduled again.
   */
  static scheduleConfig(config: CrawlConfig) {
    if (!config.cron_schedule || !cron.validate(config.cron_schedule)) {
      logger.warn(`크롤링 [${config.name}]: 유효하지 않은 cron 표현식 - ${config.cron_schedule}`);
      return;
    }

    // Replace any task already registered for this configuration.
    this.unscheduleConfig(config.id);

    const task = cron.schedule(
      config.cron_schedule,
      async () => {
        logger.info(`크롤링 [${config.name}] 스케줄 실행 시작`);
        try {
          await this.executeCrawl(config);
        } catch (error) {
          // executeCrawl can still reject (e.g. the execution log cannot be
          // created); nothing awaits this callback, so the rejection must be
          // handled here to avoid an unhandled promise rejection.
          logger.error(`크롤링 [${config.name}] 실패:`, error);
        }
      },
      { timezone: "Asia/Seoul" }
    );

    this.scheduledTasks.set(config.id, task);
    logger.info(`크롤링 [${config.name}] 스케줄 등록: ${config.cron_schedule}`);
  }

  /** Stops and forgets the cron task of a configuration, if one is registered. */
  static unscheduleConfig(configId: string) {
    const task = this.scheduledTasks.get(configId);
    if (task) {
      task.stop();
      this.scheduledTasks.delete(configId);
    }
  }

  // ─── CRUD ───

  /**
   * Lists configurations, newest first.
   *
   * @param companyCode company to filter by; "*" disables the filter.
   */
  static async getConfigs(companyCode: string) {
    const condition = companyCode === "*" ? "" : "WHERE company_code = $1";
    const params = companyCode === "*" ? [] : [companyCode];
    return query<CrawlConfig>(`SELECT * FROM crawl_configs ${condition} ORDER BY created_date DESC`, params);
  }

  /** Returns the configuration with the given id, or `null` when none exists. */
  static async getConfigById(id: string) {
    const rows = await query<CrawlConfig>(`SELECT * FROM crawl_configs WHERE id = $1`, [id]);
    return rows[0] || null;
  }

  /**
   * Inserts a configuration and, when it is active and has a cron expression,
   * schedules it.
   *
   * Defaults applied when a field is missing: method "GET", empty headers,
   * selector type "css", empty column mappings, active flag "Y".
   *
   * @returns the inserted row.
   */
  static async createConfig(data: Partial<CrawlConfig>) {
    const result = await query<CrawlConfig>(
      `INSERT INTO crawl_configs (company_code, name, url, method, headers, request_body, selector_type, row_selector, column_mappings, target_table, upsert_key, cron_schedule, is_active, writer)
       VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14) RETURNING *`,
      [
        data.company_code,
        data.name,
        data.url,
        data.method || "GET",
        JSON.stringify(data.headers || {}),
        data.request_body || null,
        data.selector_type || "css",
        data.row_selector || null,
        JSON.stringify(data.column_mappings || []),
        data.target_table,
        data.upsert_key || null,
        data.cron_schedule || null,
        data.is_active || "Y",
        data.writer || null,
      ]
    );

    const config = result[0];
    if (config.is_active === "Y" && config.cron_schedule) {
      this.scheduleConfig(config);
    }
    return config;
  }

  /**
   * Applies a partial update to a configuration and refreshes its schedule.
   *
   * Field semantics:
   *  - name, url, method, headers, selector_type, column_mappings,
   *    target_table, is_active: a missing or null value keeps the stored one;
   *  - request_body, row_selector, upsert_key, cron_schedule: an omitted
   *    (`undefined`) value keeps the stored one, while an explicit `null`
   *    clears the column. The "was it provided" flags are passed as separate
   *    parameters so that a partial update no longer wipes these columns.
   *
   * @returns the updated row, or `undefined` when no row has that id.
   */
  static async updateConfig(id: string, data: Partial<CrawlConfig>) {
    const result = await query<CrawlConfig>(
      `UPDATE crawl_configs SET
         name = COALESCE($2, name),
         url = COALESCE($3, url),
         method = COALESCE($4, method),
         headers = COALESCE($5, headers),
         request_body = CASE WHEN $14::boolean THEN $6 ELSE request_body END,
         selector_type = COALESCE($7, selector_type),
         row_selector = CASE WHEN $15::boolean THEN $8 ELSE row_selector END,
         column_mappings = COALESCE($9, column_mappings),
         target_table = COALESCE($10, target_table),
         upsert_key = CASE WHEN $16::boolean THEN $11 ELSE upsert_key END,
         cron_schedule = CASE WHEN $17::boolean THEN $12 ELSE cron_schedule END,
         is_active = COALESCE($13, is_active),
         updated_date = now()
       WHERE id = $1 RETURNING *`,
      [
        id,
        data.name,
        data.url,
        data.method,
        data.headers ? JSON.stringify(data.headers) : null,
        data.request_body ?? null,
        data.selector_type,
        data.row_selector ?? null,
        data.column_mappings ? JSON.stringify(data.column_mappings) : null,
        data.target_table,
        data.upsert_key ?? null,
        data.cron_schedule ?? null,
        data.is_active,
        data.request_body !== undefined,
        data.row_selector !== undefined,
        data.upsert_key !== undefined,
        data.cron_schedule !== undefined,
      ]
    );

    const config = result[0];
    if (config) {
      // Always drop the old task; re-register only when still schedulable.
      this.unscheduleConfig(id);
      if (config.is_active === "Y" && config.cron_schedule) {
        this.scheduleConfig(config);
      }
    }
    return config;
  }

  /** Stops the configuration's cron task and deletes its row. */
  static async deleteConfig(id: string) {
    this.unscheduleConfig(id);
    await query(`DELETE FROM crawl_configs WHERE id = $1`, [id]);
  }

  // ─── Crawl execution ───

  /**
   * Runs one crawl: fetches the page, extracts rows, saves them and records
   * the outcome in `crawl_execution_logs` and on the configuration row.
   *
   * Failures of individual rows do not abort the run; they are collected in
   * the result, logged, and stored (de-duplicated and truncated) as the log's
   * error message. A failure of the fetch/parse stage marks the run as "fail".
   *
   * @returns counts of collected and saved rows plus all error messages.
   * @throws when the initial execution-log row cannot be created.
   */
  static async executeCrawl(config: CrawlConfig): Promise<CrawlResult> {
    const logId = await this.createLog(config.id, config.company_code);
    const errors: string[] = [];
    let collected = 0;
    let saved = 0;

    try {
      // 1. HTTP request
      this.assertHttpUrl(config.url);
      const headers = {
        ...DEFAULT_HEADERS,
        ...this.parseJsonColumn<Record<string, string>>(config.headers, {}),
      };
      const response = await axios({
        method: (config.method || "GET") as any,
        url: config.url,
        headers,
        data: config.request_body || undefined,
        timeout: 30000,
        responseType: "text",
      });

      const html = response.data;
      const htmlPreview = typeof html === "string" ? html.substring(0, 2000) : "";

      // 2. DOM parsing (stored JSON columns may arrive as strings)
      const $ = cheerio.load(html);
      const mappings = this.parseJsonColumn<CrawlConfig["column_mappings"]>(config.column_mappings, []);

      // 3. Row extraction
      const rows: Record<string, any>[] = [];
      if (config.row_selector) {
        $(config.row_selector).each((_, el) => {
          rows.push(this.extractRow((selector) => $(el).find(selector), mappings));
        });
      } else {
        // Without a row selector the mappings are evaluated against the whole
        // document, producing exactly one row.
        rows.push(this.extractRow((selector) => $(selector), mappings));
      }
      collected = rows.length;

      // 4. Persist rows one by one so a bad row does not lose the others
      for (const row of rows) {
        try {
          row.company_code = config.company_code;
          if (config.upsert_key) {
            await this.upsertRow(config.target_table, row, config.upsert_key, config.company_code);
          } else {
            await this.insertRow(config.target_table, row);
          }
          saved++;
        } catch (err: unknown) {
          errors.push(`행 저장 실패: ${this.describeError(err)}`);
        }
      }

      // 5. Status update. Row-level failures are kept visible: previously they
      // were neither logged nor stored for scheduled runs.
      let rowErrorSummary: string | null = null;
      if (errors.length > 0) {
        rowErrorSummary = [...new Set(errors)].join("\n").slice(0, 500);
        logger.warn(`크롤링 [${config.name}] 행 저장 실패 ${errors.length}건: ${errors[0]}`);
      }
      await this.updateLog(logId, "success", collected, saved, rowErrorSummary, htmlPreview);
      await query(
        `UPDATE crawl_configs SET last_executed_at = now(), last_status = 'success', last_error = null WHERE id = $1`,
        [config.id]
      );

      logger.info(`크롤링 [${config.name}] 완료: ${collected}건 수집, ${saved}건 저장`);
    } catch (error: unknown) {
      const errMsg = this.describeError(error);
      errors.push(errMsg);
      logger.error(`크롤링 [${config.name}] 실패:`, error);

      try {
        await this.updateLog(logId, "fail", collected, saved, errMsg, null);
        await query(
          `UPDATE crawl_configs SET last_executed_at = now(), last_status = 'fail', last_error = $2 WHERE id = $1`,
          [config.id, errMsg]
        );
      } catch (bookkeepingError) {
        // Recording the failure must not hide the crawl result from the caller.
        logger.error(`크롤링 [${config.name}] 실패:`, bookkeepingError);
      }
    }

    return { collected, saved, errors };
  }

  // ─── URL analysis ───

  /**
   * Fetches a page and reports every `<table>` that has headers or rows:
   * a suggested selector, caption, header texts, row count and up to three
   * sample rows.
   *
   * NOTE(review): the suggested selector is a best effort. `table:nth-of-type`
   * is derived from the table's position among all tables in the document,
   * which only matches when the tables are siblings, and ids/classes are
   * inserted without CSS escaping — confirm this is acceptable for callers.
   *
   * @throws when the URL is not http(s) or the request fails.
   */
  static async analyzeUrl(url: string) {
    this.assertHttpUrl(url);
    const response = await axios({
      method: "GET",
      url,
      headers: DEFAULT_HEADERS,
      timeout: 15000,
      responseType: "text",
    });

    const $ = cheerio.load(response.data);
    const tables: Array<{
      index: number;
      selector: string;
      caption: string;
      headers: string[];
      rowCount: number;
      sampleRows: string[][];
    }> = [];

    $("table").each((i, tableEl) => {
      const $table = $(tableEl);

      // Header cells; fall back to the first row's data cells when none exist.
      const headers: string[] = [];
      $table.find("thead th, thead td, tr:first-child th").each((_, th) => {
        headers.push($(th).text().trim());
      });
      if (headers.length === 0) {
        $table.find("tr:first-child td").each((_, td) => {
          headers.push($(td).text().trim());
        });
      }

      // Data rows: prefer <tbody>; otherwise all rows minus the header row.
      const bodyRows = $table.find("tbody tr");
      const allRows = bodyRows.length > 0 ? bodyRows : $table.find("tr").slice(headers.length > 0 ? 1 : 0);
      const rowCount = allRows.length;

      // Samples (at most 3 rows)
      const sampleRows: string[][] = [];
      allRows.slice(0, 3).each((_, tr) => {
        const cells: string[] = [];
        $(tr)
          .find("td, th")
          .each((__, td) => {
            cells.push($(td).text().trim());
          });
        sampleRows.push(cells);
      });

      if (headers.length > 0 || rowCount > 0) {
        let selector = "table";
        const id = $table.attr("id");
        const cls = $table.attr("class");
        if (id) selector = `table#${id}`;
        else if (cls) selector = `table.${cls.split(/\s+/)[0]}`;
        else if (i > 0) selector = `table:nth-of-type(${i + 1})`;

        const caption = $table.find("caption").text().trim() || $table.attr("summary") || "";

        tables.push({ index: i, selector, caption, headers, rowCount, sampleRows });
      }
    });

    return {
      title: $("title").text().trim(),
      tableCount: tables.length,
      tables,
      htmlLength: response.data.length,
    };
  }

  // ─── Preview ───

  /**
   * Fetches a page and extracts at most 10 rows with the given selectors,
   * without writing anything to the database.
   *
   * @returns the total number of elements matching `rowSelector`, the preview
   *          rows and the length of the fetched HTML. Without a row selector
   *          no rows are extracted.
   * @throws when the URL is not http(s) or the request fails.
   */
  static async preview(
    url: string,
    rowSelector: string,
    columnMappings: CrawlConfig["column_mappings"],
    method = "GET",
    headers: Record<string, string> = {},
    requestBody?: string
  ) {
    this.assertHttpUrl(url);
    const mergedHeaders = { ...DEFAULT_HEADERS, ...headers };
    const response = await axios({
      method: method as any,
      url,
      headers: mergedHeaders,
      data: requestBody || undefined,
      timeout: 15000,
      responseType: "text",
    });

    const $ = cheerio.load(response.data);
    const rows: Record<string, any>[] = [];
    let totalElements = 0;

    if (rowSelector) {
      // Evaluate the selector once and reuse it for both count and preview.
      const matched = $(rowSelector);
      totalElements = matched.length;
      matched.slice(0, 10).each((_, el) => {
        rows.push(this.extractRow((selector) => $(el).find(selector), columnMappings));
      });
    }

    return {
      totalElements,
      previewRows: rows,
      htmlLength: response.data.length,
    };
  }

  // ─── Utilities ───

  /**
   * Builds one row from a set of column mappings.
   *
   * @param select   resolves a mapping's selector to the matched element(s),
   *                 scoped by the caller (row element or whole document).
   * @param mappings extraction rules; each yields one column of the row.
   */
  private static extractRow(
    select: (selector: string) => { attr(name: string): string | undefined; text(): string },
    mappings: CrawlConfig["column_mappings"]
  ): Record<string, any> {
    const row: Record<string, any> = {};
    for (const mapping of mappings) {
      const matched = select(mapping.selector);
      const raw = mapping.attribute ? matched.attr(mapping.attribute) || "" : matched.text().trim();
      row[mapping.column] = this.castValue(raw, mapping.type);
    }
    return row;
  }

  /**
   * Converts extracted text according to the mapping type.
   * Empty input becomes `null`. "number" strips everything except digits,
   * dots and minus signs before parsing and yields `null` when nothing
   * numeric remains; every other type returns the text unchanged.
   */
  private static castValue(raw: string, type: string): any {
    if (!raw) return null;
    switch (type) {
      case "number": {
        const cleaned = raw.replace(/[^0-9.\-]/g, "");
        const num = parseFloat(cleaned);
        return isNaN(num) ? null : num;
      }
      case "date":
        return raw;
      default:
        return raw;
    }
  }

  /** Returns a JSON column's value, parsing it first when it arrives as a string. */
  private static parseJsonColumn<T>(value: T | string | null | undefined, fallback: T): T {
    if (typeof value === "string") return JSON.parse(value) as T;
    return (value || fallback) as T;
  }

  /** Best-effort, null-safe extraction of a message from a thrown value. */
  private static describeError(error: unknown): string {
    if (typeof error === "object" && error !== null && "message" in error) {
      const message = (error as { message: unknown }).message;
      if (typeof message === "string" && message) return message;
    }
    return "Unknown error";
  }

  /**
   * Rejects anything that is not an absolute http(s) URL before it is fetched.
   *
   * NOTE(review): this does not stop requests to internal addresses (SSRF);
   * if that matters for this deployment, add host allow/deny rules.
   */
  private static assertHttpUrl(url: string): void {
    let parsed: URL;
    try {
      parsed = new URL(url);
    } catch {
      throw new Error("유효하지 않은 URL입니다.");
    }
    if (parsed.protocol !== "http:" && parsed.protocol !== "https:") {
      throw new Error("http/https URL만 지원합니다.");
    }
  }

  /**
   * Quotes a table or column name for use in SQL.
   * Embedded double quotes are doubled so a configured name cannot terminate
   * the identifier and inject SQL; empty names and NUL characters are refused.
   */
  private static quoteIdent(name: string): string {
    if (typeof name !== "string" || name.length === 0 || name.includes("\0")) {
      throw new Error(`유효하지 않은 식별자입니다: ${JSON.stringify(name)}`);
    }
    return `"${name.replace(/"/g, '""')}"`;
  }

  /** Inserts one row; keys of `row` are the column names. */
  private static async insertRow(tableName: string, row: Record<string, any>) {
    const cols = Object.keys(row);
    const vals = Object.values(row);
    const placeholders = cols.map((_, i) => `$${i + 1}`).join(", ");
    const colNames = cols.map((c) => this.quoteIdent(c)).join(", ");

    await query(`INSERT INTO ${this.quoteIdent(tableName)} (${colNames}) VALUES (${placeholders})`, vals);
  }

  /**
   * Updates the row identified by (`upsertKey`, company_code) when it exists,
   * otherwise inserts it. The key column and company_code are never updated.
   *
   * The existence check and the write are separate statements, so two
   * concurrent runs for the same key can both insert.
   */
  private static async upsertRow(tableName: string, row: Record<string, any>, upsertKey: string, companyCode: string) {
    const table = this.quoteIdent(tableName);
    const keyColumn = this.quoteIdent(upsertKey);

    const existing = await query(
      `SELECT 1 FROM ${table} WHERE ${keyColumn} = $1 AND company_code = $2 LIMIT 1`,
      [row[upsertKey], companyCode]
    );

    if (existing.length === 0) {
      await this.insertRow(tableName, row);
      return;
    }

    const setClauses: string[] = [];
    const vals: any[] = [];
    for (const [column, value] of Object.entries(row)) {
      if (column === upsertKey || column === "company_code") continue;
      vals.push(value);
      setClauses.push(`${this.quoteIdent(column)} = $${vals.length}`);
    }

    // Nothing besides the key columns was extracted: leave the row untouched.
    if (setClauses.length === 0) return;

    const keyIndex = vals.length + 1;
    vals.push(row[upsertKey], companyCode);
    await query(
      `UPDATE ${table} SET ${setClauses.join(", ")}, updated_date = now() WHERE ${keyColumn} = $${keyIndex} AND company_code = $${keyIndex + 1}`,
      vals
    );
  }

  /** Creates an execution-log row in state "running" and returns its id. */
  private static async createLog(configId: string, companyCode: string): Promise<string> {
    const result = await query<any>(
      `INSERT INTO crawl_execution_logs (config_id, company_code, status) VALUES ($1, $2, 'running') RETURNING id`,
      [configId, companyCode]
    );
    return result[0].id;
  }

  /** Finalizes an execution-log row with status, counters, error and HTML preview. */
  private static async updateLog(
    logId: string,
    status: string,
    collected: number,
    saved: number,
    errorMessage: string | null,
    htmlPreview: string | null
  ) {
    await query(
      `UPDATE crawl_execution_logs SET status = $2, rows_collected = $3, rows_saved = $4, error_message = $5, response_html_preview = $6, finished_at = now() WHERE id = $1`,
      [logId, status, collected, saved, errorMessage, htmlPreview]
    );
  }

  // ─── Log retrieval ───

  /**
   * Returns the latest execution logs of a configuration, newest first.
   *
   * @param limit maximum number of rows; anything that is not a positive
   *              integer falls back to 20 instead of reaching SQL `LIMIT`.
   */
  static async getLogs(configId: string, limit = 20) {
    const safeLimit = Number.isInteger(limit) && limit > 0 ? limit : 20;
    return query(
      `SELECT * FROM crawl_execution_logs WHERE config_id = $1 ORDER BY started_at DESC LIMIT $2`,
      [configId, safeLimit]
    );
  }
}
|
||||
Reference in New Issue
Block a user