import { writeFile } from 'fs/promises';
import CommonContent, { GetContentListParams, type GetContentDetailItem, type GetContentListItem } from '../../api/CommonContent';
import ProjectsContent from '../../api/inheritor/ProjectsContent';
import InheritorContent from '../../api/inheritor/InheritorContent';
import SeminarContent from '../../api/inheritor/SeminarContent';
import UnitContent from '../../api/inheritor/UnitContent';
import UnmoveableContent from '../../api/inheritor/UnmoveableContent';
import path from 'path';
import fs from 'fs';
import { argv, cwd } from 'process';

// A list record plus the detail record that main() attaches to it before rendering.
// NOTE(review): the generic argument of the original `Array` was lost in the
// collapsed source; this alias is a reconstruction - confirm against the API types.
// The index signature covers list fields this script reads (fontName, brandType,
// age, video, ...) that may not be declared on GetContentListItem.
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- record shape is not visible from this file
type ContentItem = GetContentListItem & { detail?: GetContentDetailItem; [key: string]: any };

/** Records fetched by main() and consumed by the generateMarkdown* functions. */
const data: ContentItem[] = [];

/** Page size used for the single list request each exporter performs. */
const PAGE_SIZE = 1000;

/** Entities decoded by htmlToMarkdown once all tags have been removed. */
const HTML_ENTITIES: Record<string, string> = {
  '&nbsp;': ' ',
  '&lt;': '<',
  '&gt;': '>',
  '&quot;': '"',
  '&#39;': "'",
  '&amp;': '&',
};

/**
 * Minimal regex-based HTML to Markdown conversion.
 *
 * Handles h1-h3, paragraphs, bold/italic, flat ul/ol lists, images, links and
 * <br>; every remaining tag is dropped and its text content kept.
 *
 * @param html HTML fragment; a falsy value yields an empty string.
 * @returns Trimmed Markdown text.
 */
function htmlToMarkdown(html: string): string {
  if (!html) return '';

  // `\b` after each tag name keeps <b> from matching <br>, <i> from matching
  // <img>, <p> from matching <pre>, and so on. `[\s\S]` lets content span lines.
  let md = html;

  // Headings
  md = md.replace(/<h1\b[^>]*>([\s\S]*?)<\/h1>/gi, '# $1\n\n');
  md = md.replace(/<h2\b[^>]*>([\s\S]*?)<\/h2>/gi, '## $1\n\n');
  md = md.replace(/<h3\b[^>]*>([\s\S]*?)<\/h3>/gi, '### $1\n\n');

  // Paragraphs: end each one with a blank line so adjacent paragraphs do not run together.
  md = md.replace(/<p\b[^>]*>([\s\S]*?)<\/p>/gi, '$1\n\n');

  // Bold
  md = md.replace(/<strong\b[^>]*>([\s\S]*?)<\/strong>/gi, '**$1**');
  md = md.replace(/<b\b[^>]*>([\s\S]*?)<\/b>/gi, '**$1**');

  // Italic
  md = md.replace(/<em\b[^>]*>([\s\S]*?)<\/em>/gi, '*$1*');
  md = md.replace(/<i\b[^>]*>([\s\S]*?)<\/i>/gi, '*$1*');

  // Unordered lists
  md = md.replace(/<ul\b[^>]*>([\s\S]*?)<\/ul>/gi, (_match, inner: string) =>
    inner.replace(/<li\b[^>]*>([\s\S]*?)<\/li>/gi, '- $1\n') + '\n');

  // Ordered lists. A function replacer does not expand "$1", so the captured
  // item text has to be taken from the callback arguments.
  md = md.replace(/<ol\b[^>]*>([\s\S]*?)<\/ol>/gi, (_match, inner: string) => {
    let position = 1;
    const items = inner.replace(
      /<li\b[^>]*>([\s\S]*?)<\/li>/gi,
      (_li, text: string) => `${position++}. ${text}\n`,
    );
    return items + '\n';
  });

  // Images: read src/alt independently so attribute order and a missing alt do not matter.
  md = md.replace(/<img\b[^>]*>/gi, (tag) => {
    const src = /\ssrc\s*=\s*"([^"]*)"/i.exec(tag)?.[1];
    if (!src) return '';
    const alt = /\salt\s*=\s*"([^"]*)"/i.exec(tag)?.[1] ?? '';
    return `![${alt}](${src})`;
  });

  // Links
  md = md.replace(/<a\b[^>]*href="([^"]*)"[^>]*>([\s\S]*?)<\/a>/gi, '[$2]($1)');

  // Line breaks
  md = md.replace(/<br\b[^>]*>/gi, ' ');

  // Drop every remaining tag
  md = md.replace(/<[^>]*>/g, '');

  // Decode common entities in a single pass (so "&amp;lt;" becomes "&lt;", not "<").
  md = md.replace(/&(?:nbsp|lt|gt|quot|#39|amp);/g, (entity) => HTML_ENTITIES[entity] ?? entity);

  // Collapse runs of blank lines
  md = md.replace(/\n\s*\n/g, '\n\n');

  return md.trim();
}

/**
 * Turns an arbitrary string into a file name that is valid on Windows, macOS and Linux.
 *
 * @param fileName Raw name (may contain an extension).
 * @param replacement Text substituted for each illegal character; defaults to `_`.
 * @returns The sanitized name, `'unnamed_file'` when nothing is left, or `''`
 *          when `fileName` is not a string.
 */
function sanitizeFileNameAdvanced(fileName: string, replacement = '_'): string {
  if (typeof fileName !== 'string') return '';

  // Characters illegal on at least one platform, plus ASCII control characters.
  const illegalRegex = /[<>:"/\\|?*\x00-\x1F]/g;
  // Device names Windows reserves regardless of case and extension.
  const windowsReservedNames = new Set([
    'con', 'nul', 'prn', 'aux',
    'com1', 'com2', 'com3', 'com4', 'com5', 'com6', 'com7', 'com8', 'com9',
    'lpt1', 'lpt2', 'lpt3', 'lpt4', 'lpt5', 'lpt6', 'lpt7', 'lpt8', 'lpt9',
  ]);
  // Windows allows 255 characters; stay well below that.
  const maxFileNameLength = 200;
  const trailingDotsAndSpaces = /[. ]+$/;

  // Step 1: replace illegal characters. A function replacer keeps "$" in
  // `replacement` from being interpreted as a replacement pattern.
  let safeFileName = fileName.replace(illegalRegex, () => replacement);

  // Step 2: trim, drop trailing dots/spaces (both invalid at the end of a
  // Windows name) and collapse runs of the replacement text.
  safeFileName = safeFileName.trim().replace(trailingDotsAndSpaces, '');
  if (replacement) {
    const escaped = replacement.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    safeFileName = safeFileName.replace(new RegExp(`(?:${escaped})+`, 'g'), () => replacement);
  }

  // Step 3: reserved device names. The name stays reserved whatever follows the
  // first dot, so the timestamp goes on the stem, not after the extension.
  const firstDot = safeFileName.indexOf('.');
  const stem = firstDot === -1 ? safeFileName : safeFileName.slice(0, firstDot);
  if (windowsReservedNames.has(stem.trim().toLowerCase())) {
    const rest = firstDot === -1 ? '' : safeFileName.slice(firstDot);
    safeFileName = `${stem}_${Date.now()}${rest}`;
  }

  // Step 4: enforce the length limit, keeping the extension when it fits.
  if (safeFileName.length > maxFileNameLength) {
    const lastDot = safeFileName.lastIndexOf('.');
    const ext = lastDot > 0 ? safeFileName.slice(lastDot) : '';
    safeFileName = ext && ext.length < maxFileNameLength
      ? safeFileName.slice(0, maxFileNameLength - ext.length) + ext
      : safeFileName.slice(0, maxFileNameLength);
    safeFileName = safeFileName.replace(trailingDotsAndSpaces, '');
  }

  // Step 5: never return an empty name.
  return safeFileName || 'unnamed_file';
}

/** One labelled value; falsy values are omitted from the output. */
type Field = readonly [label: string, value: unknown];

/** Renders each truthy field as `<prefix><label>: <value>\n`. */
function renderFields(fields: readonly Field[], prefix = ''): string {
  return fields
    .filter(([, value]) => Boolean(value))
    .map(([label, value]) => `${prefix}${label}: ${String(value)}\n`)
    .join('');
}

/** Renders the `---` delimited metadata header placed at the top of every document. */
function renderFrontMatter(fields: readonly Field[]): string {
  return `---\n${renderFields(fields)}---\n\n`;
}

/** Converts an optional HTML field into a Markdown section, optionally under a heading. */
function renderHtmlSection(html: unknown, heading = ''): string {
  if (!html) return '';
  return `${heading}${htmlToMarkdown(html as string)}\n\n`;
}

/** Maps the gender code to its label; an absent code yields undefined so the field is omitted. */
function genderLabel(gender: unknown): string | undefined {
  if (gender == null || gender === '') return undefined;
  return String(gender) === '1' ? '男' : '女';
}

/**
 * Writes one document as `<title>.md` (falling back to the id) inside `subDir`.
 *
 * `usedNames` holds the names already written in this run; when two items map
 * to the same name the id is appended so the later one does not overwrite the
 * earlier one. A failed write is logged and does not abort the export.
 */
async function writeMarkdownFile(
  subDir: string,
  usedNames: Set<string>,
  item: ContentItem,
  md: string,
): Promise<void> {
  const baseName = sanitizeFileNameAdvanced(String(item.title || item.id));
  let name = baseName;
  if (usedNames.has(name.toLowerCase())) name = sanitizeFileNameAdvanced(`${baseName}_${item.id}`);
  for (let n = 2; usedNames.has(name.toLowerCase()); n++) name = `${baseName}_${item.id}_${n}`;
  // Lower-cased because file names are case-insensitive on Windows and default macOS volumes.
  usedNames.add(name.toLowerCase());

  try {
    await writeFile(path.join(subDir, `${name}.md`), md);
  } catch (err) {
    console.error(`写入文件 ${item.title || item.id}.md 失败:`, err);
  }
}

/**
 * Writes one Markdown document per entry of `data` in the layout shared by
 * the ich, seminar, old and unit exports.
 *
 * @param subDir Output directory (must exist).
 * @param type Human-readable category label written into each document.
 */
async function generateMarkdownIch(subDir: string, type: string): Promise<void> {
  const usedNames = new Set<string>();

  for (const item of data) {
    const detail = item.detail;

    let md = renderFrontMatter([
      ['level', item.levelText],
      ['crType', item.crTypeText],
      ['region', item.regionText],
      ['batch', item.batchText],
      ['ichType', item.ichTypeText],
      ['type', type],
      ['unit', detail?.unit],
      ['name', item.title],
      ['address', item.address],
    ]);

    // Basic information
    md += `# ${item.title}\n\n`;
    if (item.desc) md += `${item.desc}\n\n`;
    md += `${item.title}类型:${type}\n`;
    md += renderFields([
      [`${item.title}非遗级别`, item.levelText],
      [`${item.title}非遗类别`, item.ichTypeText],
      [`${item.title}地区`, item.regionText],
      [`${item.title}批次`, item.batchText],
      // Fall back to the detail record, which is where the front matter reads the unit from.
      [`${item.title}保护单位`, item.unit || detail?.unit],
      [`${item.title}地址`, item.address],
      [`${item.title}字号名称`, item.fontName],
      [`${item.title}认定类型`, item.brandType],
    ], '- ');
    md += `${item.title}数据库索引ID: 类型: intangible ID: ${item.id || '无'}\n\n`;

    // Detail information
    if (detail) {
      md += renderHtmlSection(detail.intro);
      md += renderHtmlSection(detail.content);

      // Inheritors
      if (detail.inheritorsList && detail.inheritorsList.length > 0) {
        md += `${item.title}相关非遗传承人:${detail.inheritorsList.map(inheritor => inheritor.title).join('、')}\n\n`;
        md += renderHtmlSection(detail.inheritor);
      }
      // Training sites
      if (detail.ichSitesList && detail.ichSitesList.length > 0)
        md += `${item.title}相关非遗传习所: ${detail.ichSitesList.map(site => site.title).join('、')}\n\n`;
      // The same project at other levels
      if (detail.otherLevel && detail.otherLevel.length > 0)
        md += `${item.title}其他级别: ` + detail.otherLevel.map(project => project.levelText).join('、') + `\n\n`;
      // Lineage
      md += renderHtmlSection(detail.pedigree, `${item.title}的传承谱系: `);
    }

    await writeMarkdownFile(subDir, usedNames, item, md);
  }
}

/**
 * Writes one Markdown document per entry of `data` in the inheritor layout.
 *
 * @param subDir Output directory (must exist).
 */
async function generateMarkdownInheritor(subDir: string): Promise<void> {
  const usedNames = new Set<string>();

  for (const item of data) {
    const detail = item.detail;
    const sex = genderLabel(detail?.gender);

    let md = renderFrontMatter([
      ['level', item.levelText],
      ['type', '非遗传承人'],
      ['unit', detail?.unit],
      ['sex', sex],
      ['name', item.title],
    ]);

    // Basic information
    md += `# ${item.title}\n\n`;
    if (item.desc) md += `${item.desc}\n\n`;
    md += `## 基本信息\n\n`;
    md += `${item.title}类型:非遗传承人\n`;
    md += renderFields([
      ['民族', detail?.nation],
      ['性别', sex],
      ['出生日期', detail?.dateBirth],
      ['出生地区', detail?.birthplace],
      ['单位', detail?.unit],
      ['传承项目', detail?.associationMeList?.[0]?.title],
      // NOTE(review): this row used to print batchText, duplicating the row
      // below; levelText matches the `level` front-matter entry - confirm.
      ['传承人级别', item.levelText],
      ['公布批次', detail?.batchText],
    ], '- ');

    md += `\n## ${item.title}数据库索引ID\n\n`;
    md += `- 类型: inheritor\n`;
    md += `- ID: ${item.id || '无'}\n\n`;

    // Detail information
    if (detail) {
      md += renderHtmlSection(detail.intro, `## ${item.title}简介\n\n`);
      md += renderHtmlSection(detail.content);
      // Awards
      md += renderHtmlSection(detail.prize);
      // Related projects
      if (detail.associationMeList && detail.associationMeList.length > 0)
        md += `${item.title}相关项目: ${detail.associationMeList.map(project => project.title).join('、')}\n\n`;
      // Training sites
      if (detail.ichSitesList && detail.ichSitesList.length > 0)
        md += `${item.title}相关非遗传习所: ${detail.ichSitesList.map(site => site.title).join('、')}\n\n`;
    }

    await writeMarkdownFile(subDir, usedNames, item, md);
  }
}

/**
 * Writes one Markdown document per entry of `data` in the artifact layout.
 *
 * @param subDir Output directory (must exist).
 */
async function generateMarkdownArtifact(subDir: string): Promise<void> {
  const usedNames = new Set<string>();

  for (const item of data) {
    const detail = item.detail;

    let md = renderFrontMatter([
      ['name', item.title],
      ['crType', item.crTypeText],
      ['region', item.regionText],
      ['level', item.levelText],
      ['address', item.address],
      ['age', item.age],
      ['type', '非遗文物'],
      ['unit', detail?.unit],
    ]);

    // Basic information
    md += `# ${item.title}\n\n`;
    if (item.desc) md += `${item.desc}\n\n`;
    md += `## 基本信息\n\n`;
    md += `类型:非遗文物\n`;
    md += renderFields([
      ['开放时间', detail?.openStatusText],
      ['年代', item.age],
      ['级别', item.levelText],
      ['所属区域', item.regionText],
      ['文物类型', item.crTypeText],
      ['单位', detail?.unit],
    ], '- ');

    md += `\n## 数据库索引ID\n\n`;
    md += `- 类型: artifact\n`;
    md += `- ID: ${item.id || '无'}\n\n`;

    if (item.video) {
      md += `## 视频\n\n`;
      md += `![${item.title}视频](${item.video})\n\n`;
    }

    // Detail information
    if (detail) {
      md += renderHtmlSection(detail.intro, `## 简介\n\n`);
      md += renderHtmlSection(detail.content, `## 详情\n\n`);
      md += renderHtmlSection(detail.protectedArea, `## 保护范围\n\n`);
      md += renderHtmlSection(detail.environment, `## 建筑环境\n\n`);
      md += renderHtmlSection(detail.价值评估, `## 价值评估\n\n`);
    }

    await writeMarkdownFile(subDir, usedNames, item, md);
  }
}

/** How one export type fetches its records and renders them. */
interface ExportJob {
  /** Requests the record list. */
  fetchList: () => Promise<{ list: GetContentListItem[] }>;
  /** Requests the detail record for one list item. */
  fetchDetail: (item: ContentItem) => Promise<unknown>;
  /** Renders everything in `data` into `dir`. */
  render: (dir: string) => Promise<void>;
}

/** Export jobs keyed by the CLI type argument; the key is also the output folder under dist/. */
const EXPORT_JOBS = new Map<string, ExportJob>([
  ['ich', {
    fetchList: () => ProjectsContent.getContentList(new GetContentListParams(), 1, PAGE_SIZE),
    fetchDetail: item => ProjectsContent.getContentDetail(item.id),
    render: dir => generateMarkdownIch(dir, '非遗项目'),
  }],
  ['seminar', {
    fetchList: () => SeminarContent.getContentList(new GetContentListParams(), 1, PAGE_SIZE),
    fetchDetail: item => SeminarContent.getContentDetail(item.id),
    render: dir => generateMarkdownIch(dir, '非遗传习所'),
  }],
  ['old', {
    fetchList: () => CommonContent.getContentList(
      new GetContentListParams().setModelId(17).setMainBodyColumnId(312),
      1,
      PAGE_SIZE,
    ),
    fetchDetail: item => CommonContent.getContentDetail(item.id),
    render: dir => generateMarkdownIch(dir, '老字号'),
  }],
  ['unit', {
    fetchList: () => UnitContent.getContentList(new GetContentListParams(), 1, PAGE_SIZE),
    fetchDetail: item => UnitContent.getContentDetail(item.id),
    render: dir => generateMarkdownIch(dir, '非遗保护单位'),
  }],
  ['inheritor', {
    fetchList: () => InheritorContent.getContentList(new GetContentListParams(), 1, PAGE_SIZE),
    fetchDetail: item => InheritorContent.getContentDetail(item.id),
    render: dir => generateMarkdownInheritor(dir),
  }],
  ['artifact', {
    fetchList: () => UnmoveableContent.getContentList(new GetContentListParams(), 1, PAGE_SIZE),
    fetchDetail: item => UnmoveableContent.getContentDetail(item.id),
    render: dir => generateMarkdownArtifact(dir),
  }],
]);

/** Creates `dist/<name>` under the current working directory (if needed) and returns its path. */
function makeOutputDir(name: string): string {
  const dir = path.join(cwd(), `dist/${name}`);
  // `recursive: true` makes this a no-op when the directory already exists.
  fs.mkdirSync(dir, { recursive: true });
  return dir;
}

/**
 * CLI entry point.
 *
 * `argv[2]` selects an export type (ich, seminar, old, unit, inheritor,
 * artifact) or `publishToDify`, which takes a dataset id and a local directory
 * as the next two arguments.
 */
async function main(): Promise<void> {
  const type = argv[2];

  if (type === 'publishToDify') {
    const datasetId = argv[3];
    const localDir = argv[4];
    if (!datasetId || !localDir) {
      console.error('请提供 datasetId 和 localDir 参数');
      console.error('用法: node ich.js publishToDify <datasetId> <localDir>');
      return;
    }
    await publishToDify(datasetId, localDir);
    return;
  }

  const job = type === undefined ? undefined : EXPORT_JOBS.get(type);
  if (type === undefined || !job) {
    console.log('不支持的类型');
    return;
  }

  const dir = makeOutputDir(type);

  const { list } = await job.fetchList();
  for (const item of list) data.push(item as ContentItem);

  // Details are requested one at a time, as before.
  for (const item of data) item.detail = (await job.fetchDetail(item)) as GetContentDetailItem;

  // Awaited so that rendering errors reach the handler attached to main().
  await job.render(dir);
}

/** Extracts a readable message from a failed response without assuming the body is JSON. */
async function readErrorMessage(response: Response): Promise<string> {
  const fallback = response.statusText || `HTTP ${response.status}`;
  try {
    const parsed: unknown = JSON.parse(await response.text());
    if (typeof parsed === 'object' && parsed !== null && 'message' in parsed) {
      const { message } = parsed as { message: unknown };
      if (typeof message === 'string' && message) return message;
    }
  } catch {
    // Body was missing or not JSON; use the status text instead.
  }
  return fallback;
}

/**
 * Uploads every `.md` file of `localDir` to a Dify dataset through the
 * create-by-text endpoint.
 *
 * The API key is read from `DIFY_API_KEY`; the endpoint defaults to
 * `http://localhost:8089/v1/datasets` and can be overridden with `DIFY_API_URL`.
 * A failure on one file is reported and the remaining files are still uploaded.
 *
 * @param datasetId Target dataset id.
 * @param localDir Directory containing the Markdown files.
 */
async function publishToDify(datasetId: string, localDir: string): Promise<void> {
  // SECURITY: the key must come from the environment. A literal key used to be
  // the fallback here; it should be treated as leaked and rotated.
  const apiKey = process.env.DIFY_API_KEY;
  const apiUrl = process.env.DIFY_API_URL || 'http://localhost:8089/v1/datasets';

  if (!apiKey) {
    console.error('请设置 DIFY_API_KEY 环境变量');
    return;
  }
  if (!fs.existsSync(localDir)) {
    console.error(`本地目录不存在: ${localDir}`);
    return;
  }

  console.log(`开始上传文档到 Dify 知识库 (ID: ${datasetId})...`);
  console.log(`本地目录: ${localDir}`);

  let mdFiles: string[];
  try {
    mdFiles = fs.readdirSync(localDir).filter(file => file.endsWith('.md'));
  } catch (error) {
    console.error('上传过程中发生错误:', error);
    return;
  }

  if (mdFiles.length === 0) {
    console.log('目录中没有找到 .md 文件');
    return;
  }
  console.log(`找到 ${mdFiles.length} 个 .md 文件`);

  const endpoint = `${apiUrl}/${encodeURIComponent(datasetId)}/document/create-by-text`;
  let failed = 0;

  for (const file of mdFiles) {
    console.log(`正在上传: ${file}`);
    try {
      const content = fs.readFileSync(path.join(localDir, file), 'utf8');
      const response = await fetch(endpoint, {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
          'Authorization': `Bearer ${apiKey}`,
        },
        body: JSON.stringify({
          name: file,
          text: content,
          indexing_technique: 'high_quality',
          doc_language: 'zh',
        }),
      });

      if (response.ok) {
        // Read the body to completion; its content is not needed.
        await response.text();
        console.log(`✅ 上传成功: ${file}`);
      } else {
        failed++;
        console.error(`❌ 上传失败: ${file}`);
        console.error(` 错误信息: ${await readErrorMessage(response)}`);
      }
    } catch (error) {
      failed++;
      console.error(`❌ 上传失败: ${file}`);
      console.error('上传过程中发生错误:', error);
    }
  }

  if (failed === 0) {
    console.log('\n🎉 所有文档上传完成!');
  } else {
    console.error(`\n上传结束: 成功 ${mdFiles.length - failed} 个, 失败 ${failed} 个`);
    process.exitCode = 1;
  }
}

main().catch((error: unknown) => {
  console.error('执行失败:', error);
  process.exitCode = 1;
});