feat: add web crawling management functionality

- Introduced a new crawling management feature allowing users to configure, execute, and log web crawls.
- Added CRUD operations for crawl configurations, including URL analysis and preview capabilities.
- Implemented a new service for handling crawling logic and scheduling tasks.
- Integrated cheerio for HTML parsing and axios for HTTP requests.
- Created a sample HTML page for testing crawling functionality.

This commit enhances the application's ability to collect data from external websites.
This commit is contained in:
kjs
2026-03-26 16:30:53 +09:00
parent 07777e314b
commit 5da134f016
8 changed files with 1700 additions and 0 deletions

View File

@@ -0,0 +1,124 @@
import { Request, Response } from "express";
import { CrawlService } from "../services/crawlService";
import { logger } from "../utils/logger";
// Express Request carrying the authenticated caller's identity.
// `user` is optional — presumably populated by an auth middleware
// upstream of these handlers; TODO confirm against the router setup.
interface AuthenticatedRequest extends Request {
  user?: { companyCode: string; userId: string };
}
// 설정 목록 조회
// List crawl configurations scoped to the caller's company.
// Falls back to the wildcard company code "*" when no user is attached.
export async function getCrawlConfigs(req: AuthenticatedRequest, res: Response) {
  try {
    const scope = req.user?.companyCode || "*";
    const rows = await CrawlService.getConfigs(scope);
    return res.json({ success: true, data: rows });
  } catch (error: any) {
    // Log with context, then surface the message as a 500 payload.
    logger.error("크롤링 설정 조회 실패:", error);
    return res.status(500).json({ success: false, message: error.message });
  }
}
// 설정 상세 조회
// Fetch one crawl configuration by route id; responds 404 when absent.
export async function getCrawlConfig(req: AuthenticatedRequest, res: Response) {
  try {
    const found = await CrawlService.getConfigById(req.params.id);
    if (!found) {
      return res.status(404).json({ success: false, message: "설정을 찾을 수 없습니다." });
    }
    return res.json({ success: true, data: found });
  } catch (error: any) {
    logger.error("크롤링 설정 상세 조회 실패:", error);
    return res.status(500).json({ success: false, message: error.message });
  }
}
// 설정 생성
// Create a crawl configuration. The authenticated user's company code
// and id are stamped onto the payload, overriding any client-supplied
// values (company_code falls back to the body when no user is present).
export async function createCrawlConfig(req: AuthenticatedRequest, res: Response) {
  try {
    const payload = Object.assign({}, req.body, {
      company_code: req.user?.companyCode || req.body.company_code,
      writer: req.user?.userId,
    });
    const created = await CrawlService.createConfig(payload);
    return res.json({ success: true, data: created });
  } catch (error: any) {
    logger.error("크롤링 설정 생성 실패:", error);
    return res.status(500).json({ success: false, message: error.message });
  }
}
// 설정 수정
// Update an existing crawl configuration; responds 404 when the id
// does not resolve to a stored configuration.
export async function updateCrawlConfig(req: AuthenticatedRequest, res: Response) {
  try {
    const updated = await CrawlService.updateConfig(req.params.id, req.body);
    if (!updated) {
      return res.status(404).json({ success: false, message: "설정을 찾을 수 없습니다." });
    }
    return res.json({ success: true, data: updated });
  } catch (error: any) {
    logger.error("크롤링 설정 수정 실패:", error);
    return res.status(500).json({ success: false, message: error.message });
  }
}
// 설정 삭제
// Delete a crawl configuration by id. No existence check is performed
// here; any failure from the service layer is reported as a 500.
export async function deleteCrawlConfig(req: AuthenticatedRequest, res: Response) {
  try {
    const { id } = req.params;
    await CrawlService.deleteConfig(id);
    return res.json({ success: true });
  } catch (error: any) {
    logger.error("크롤링 설정 삭제 실패:", error);
    return res.status(500).json({ success: false, message: error.message });
  }
}
// 미리보기
// Dry-run a crawl against the supplied URL/selectors without persisting
// anything; returns 400 when the URL is missing.
export async function previewCrawl(req: AuthenticatedRequest, res: Response) {
  try {
    // Destructure inside try so a missing/undefined body is still
    // reported as a 500, matching the original control flow.
    const { url, row_selector, column_mappings, method, headers, request_body } = req.body;
    if (!url) {
      return res.status(400).json({ success: false, message: "URL은 필수입니다." });
    }
    const preview = await CrawlService.preview(
      url,
      row_selector,
      column_mappings || [],
      method,
      headers,
      request_body
    );
    return res.json({ success: true, data: preview });
  } catch (error: any) {
    logger.error("크롤링 미리보기 실패:", error);
    return res.status(500).json({ success: false, message: error.message });
  }
}
// URL 자동 분석 — 페이지의 테이블/리스트 구조를 감지
// Auto-analyze a URL — detects table/list structures on the page via
// the service layer. Returns 400 when no URL was provided.
export async function analyzeUrl(req: AuthenticatedRequest, res: Response) {
  try {
    const targetUrl = req.body.url;
    if (!targetUrl) {
      return res.status(400).json({ success: false, message: "URL은 필수입니다." });
    }
    const analysis = await CrawlService.analyzeUrl(targetUrl);
    return res.json({ success: true, data: analysis });
  } catch (error: any) {
    logger.error("URL 분석 실패:", error);
    return res.status(500).json({ success: false, message: error.message });
  }
}
// 수동 실행
// Manually trigger a crawl run for the configuration identified by the
// route id; responds 404 when the configuration does not exist.
export async function executeCrawl(req: AuthenticatedRequest, res: Response) {
  try {
    const target = await CrawlService.getConfigById(req.params.id);
    if (!target) {
      return res.status(404).json({ success: false, message: "설정을 찾을 수 없습니다." });
    }
    const outcome = await CrawlService.executeCrawl(target);
    return res.json({ success: true, data: outcome });
  } catch (error: any) {
    logger.error("크롤링 수동 실행 실패:", error);
    return res.status(500).json({ success: false, message: error.message });
  }
}
// 실행 로그 조회
// List recent execution logs for a crawl configuration.
// The `limit` query parameter caps the number of rows returned; it
// defaults to 20 when missing, non-numeric, or not a positive integer.
export async function getCrawlLogs(req: AuthenticatedRequest, res: Response) {
  try {
    const DEFAULT_LIMIT = 20;
    // Explicit radix 10 prevents "0x…" inputs from parsing as hex, and
    // the positive-integer guard stops 0/negative/NaN values from being
    // forwarded to the service (the old `|| 20` let negatives through).
    const parsed = Number.parseInt(req.query.limit as string, 10);
    const limit = Number.isInteger(parsed) && parsed > 0 ? parsed : DEFAULT_LIMIT;
    const logs = await CrawlService.getLogs(req.params.id, limit);
    return res.json({ success: true, data: logs });
  } catch (error: any) {
    logger.error("크롤링 로그 조회 실패:", error);
    return res.status(500).json({ success: false, message: error.message });
  }
}