| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497 |
- import { writeFile } from 'fs/promises';
- import CommonContent, { GetContentListParams, type GetContentDetailItem, type GetContentListItem } from '../../api/CommonContent';
- import ProjectsContent from '../../api/inheritor/ProjectsContent';
- import InheritorContent from '../../api/inheritor/InheritorContent';
- import SeminarContent from '../../api/inheritor/SeminarContent';
- import UnitContent from '../../api/inheritor/UnitContent';
- import UnmoveableContent from '../../api/inheritor/UnmoveableContent';
- import path from 'path';
- import fs from 'fs';
- import { argv, cwd } from 'process';
/**
 * Module-level working set shared by `main` and the Markdown generators:
 * list items fetched from the content API, each optionally enriched with
 * its detail record.
 */
const data: Array<GetContentListItem & { detail?: GetContentDetailItem }> = [];
/**
 * Minimal, regex-based HTML → Markdown conversion.
 *
 * Handles h1–h3, paragraphs, bold/italic, unordered and ordered lists,
 * links and line breaks; every remaining tag is stripped. Inline patterns
 * use `.` and therefore only match when the element sits on a single line.
 *
 * @param html HTML fragment; an empty/falsy value yields an empty string.
 * @returns Trimmed Markdown text.
 */
function htmlToMarkdown(html: string): string {
  if (!html) return '';

  // NOTE: every tag name is followed by `\b` so that e.g. `<b` does not
  // match `<br>`, `<i` does not match `<img>` and `<li` does not match `<link>`.

  // Headings
  html = html.replace(/<h1\b[^>]*>(.*?)<\/h1>/gi, '# $1\n\n');
  html = html.replace(/<h2\b[^>]*>(.*?)<\/h2>/gi, '## $1\n\n');
  html = html.replace(/<h3\b[^>]*>(.*?)<\/h3>/gi, '### $1\n\n');

  // Paragraphs (unwrapped in place; no separator is inserted)
  html = html.replace(/<p\b[^>]*>(.*?)<\/p>/gi, '$1');

  // Bold
  html = html.replace(/<strong\b[^>]*>(.*?)<\/strong>/gi, '**$1**');
  html = html.replace(/<b\b[^>]*>(.*?)<\/b>/gi, '**$1**');

  // Italic
  html = html.replace(/<em\b[^>]*>(.*?)<\/em>/gi, '*$1*');
  html = html.replace(/<i\b[^>]*>(.*?)<\/i>/gi, '*$1*');

  // Unordered lists
  html = html.replace(/<ul\b[^>]*>([\s\S]*?)<\/ul>/gi, (_match: string, items: string) => {
    return items.replace(/<li\b[^>]*>(.*?)<\/li>/gi, '- $1\n') + '\n';
  });

  // Ordered lists. A function replacer does NOT expand `$1`, so the captured
  // item text has to be taken from the callback arguments.
  html = html.replace(/<ol\b[^>]*>([\s\S]*?)<\/ol>/gi, (_match: string, items: string) => {
    let count = 1;
    return (
      items.replace(/<li\b[^>]*>(.*?)<\/li>/gi, (_li: string, text: string) => `${count++}. ${text}\n`) + '\n'
    );
  });

  // Images are dropped from the output.
  // NOTE(review): the pattern captures src/alt but the replacement is empty —
  // confirm whether a Markdown image was intended here.
  html = html.replace(/<img\b[^>]*src="([^"]*)"[^>]*alt="([^"]*)"[^>]*>/gi, '');

  // Links
  html = html.replace(/<a\b[^>]*href="([^"]*)"[^>]*>(.*?)<\/a>/gi, '[$2]($1)');

  // Line breaks become a single space
  html = html.replace(/<br\b[^>]*>/gi, ' ');

  // Strip every remaining HTML tag
  html = html.replace(/<[^>]*>/g, '');

  // Collapse runs of blank lines
  html = html.replace(/\n\s*\n/g, '\n\n');

  return html.trim();
}
/**
 * Turns an arbitrary string into a file name that is safe on Windows,
 * macOS and Linux.
 *
 * Steps: replace illegal characters, collapse consecutive replacements,
 * trim whitespace and trailing dots, disambiguate Windows reserved device
 * names, cap the length, and fall back to a placeholder for empty results.
 *
 * @param fileName Raw name (e.g. a content title).
 * @param replacement Text substituted for each illegal character; it is
 *   treated literally (regex metacharacters and `$` patterns are not
 *   interpreted) and may be empty.
 * @returns The sanitized name, `'unnamed_file'` when nothing is left, or
 *   `''` when `fileName` is not a string.
 */
function sanitizeFileNameAdvanced(fileName: string, replacement = '_'): string {
  if (typeof fileName !== 'string') return '';

  // Characters illegal on at least one platform, plus ASCII control characters.
  const illegalChars = /[<>:"/\\|?*\x00-\x1F]/g;
  // Windows reserved device names (compared case-insensitively).
  const windowsReservedNames = new Set([
    'con', 'nul', 'prn', 'aux', 'com1', 'com2', 'com3', 'com4',
    'com5', 'com6', 'com7', 'com8', 'com9', 'lpt1', 'lpt2',
    'lpt3', 'lpt4', 'lpt5', 'lpt6', 'lpt7', 'lpt8', 'lpt9',
  ]);
  // Windows allows 255 characters; 200 leaves headroom.
  const maxFileNameLength = 200;
  const trailingDotsOrSpaces = /[.\s]+$/;

  // Step 1: replace illegal characters (function replacer keeps `$` literal).
  let safeFileName = fileName.replace(illegalChars, () => replacement);

  // Step 2: collapse runs of the replacement. The replacement is escaped and
  // grouped so that values such as '.', '+' or multi-character strings work;
  // an empty replacement has nothing to collapse.
  if (replacement.length > 0) {
    const escaped = replacement.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    safeFileName = safeFileName.replace(new RegExp(`(?:${escaped}){2,}`, 'g'), () => replacement);
  }

  // Step 3: trim surrounding whitespace and ALL trailing dots/spaces
  // (Windows silently drops them, which would change the effective name).
  safeFileName = safeFileName.trim().replace(trailingDotsOrSpaces, '');

  // Step 4: Windows reserved names are decided by the part before the first
  // dot, so the timestamp is attached to that stem, not after the extension.
  const firstDot = safeFileName.indexOf('.');
  const stem = firstDot === -1 ? safeFileName : safeFileName.slice(0, firstDot);
  if (windowsReservedNames.has(stem.toLowerCase())) {
    const rest = firstDot === -1 ? '' : safeFileName.slice(firstDot);
    safeFileName = `${stem}_${Date.now()}${rest}`;
  }

  // Step 5: cap the length, keeping the extension when it fits.
  if (safeFileName.length > maxFileNameLength) {
    const lastDot = safeFileName.lastIndexOf('.');
    const ext = lastDot > 0 ? safeFileName.slice(lastDot) : '';
    if (ext && ext.length < maxFileNameLength) {
      safeFileName = safeFileName.slice(0, maxFileNameLength - ext.length) + ext;
    } else {
      // No usable extension (or the "extension" alone exceeds the limit).
      safeFileName = safeFileName.slice(0, maxFileNameLength).replace(trailingDotsOrSpaces, '');
    }
  }

  // Step 6: never return an empty name.
  return safeFileName || 'unnamed_file';
}
/**
 * Writes one Markdown file per entry of the module-level `data` array into
 * `subDir`. Each file consists of a front-matter block, a heading, the
 * description, a bullet list of basic facts and, when a detail record is
 * attached, the converted HTML sections. A failed write is logged and the
 * loop continues with the next entry.
 *
 * @param subDir Output directory (must already exist).
 * @param type Human-readable content type written into the document.
 */
async function generateMarkdownIch(subDir: string, type: string) {
  for (const entry of data) {
    const title = entry.title;
    const detail = entry.detail;

    // Front-matter keys, in output order; falsy values are skipped.
    const frontMatter: Array<[string, unknown]> = [
      ['level', entry.levelText],
      ['crType', entry.crTypeText],
      ['region', entry.regionText],
      ['batch', entry.batchText],
      ['ichType', entry.ichTypeText],
      ['type', type],
      ['unit', detail?.unit],
      ['name', title],
      ['address', entry.address],
    ];
    // Bullet rows of the basic-information list; falsy values are skipped.
    const facts: Array<[string, unknown]> = [
      [`${title}非遗级别`, entry.levelText],
      [`${title}非遗类别`, entry.ichTypeText],
      [`${title}地区`, entry.regionText],
      [`${title}批次`, entry.batchText],
      [`${title}保护单位`, entry.unit],
      [`${title}地址`, entry.address],
      [`${title}字号名称`, entry.fontName],
      [`${title}认定类型`, entry.brandType],
    ];

    const parts: string[] = ['---\n'];
    for (const [key, value] of frontMatter) {
      if (value) parts.push(`${key}: ${value}\n`);
    }
    parts.push('---\n\n');

    // Basic information
    parts.push(`# ${title}\n\n`);
    if (entry.desc) parts.push(`${entry.desc}\n\n`);
    parts.push(`${title}类型:${type}\n`);
    for (const [label, value] of facts) {
      if (value) parts.push(`- ${label}: ${value}\n`);
    }
    parts.push(`${title}数据库索引ID: 类型: intangible ID: ${entry.id || '无'}\n\n`);

    // Detail sections
    if (detail) {
      // Introduction
      if (detail.intro) parts.push(htmlToMarkdown(detail.intro) + '\n\n');
      // Main content
      if (detail.content) parts.push(htmlToMarkdown(detail.content) + '\n\n');

      // Inheritors (the free-text block is only emitted when the list is non-empty)
      const inheritors = detail.inheritorsList;
      if (inheritors && inheritors.length > 0) {
        const names = inheritors.map(person => person.title).join('、');
        parts.push(`${title}相关非遗传承人:${names}\n\n`);
        if (detail.inheritor) parts.push(htmlToMarkdown(detail.inheritor) + '\n\n');
      }

      // Training sites
      const sites = detail.ichSitesList;
      if (sites && sites.length > 0) {
        parts.push(`${title}相关非遗传习所: ${sites.map(place => place.title).join('、')}\n\n`);
      }

      // Same project at other levels
      const otherLevels = detail.otherLevel;
      if (otherLevels && otherLevels.length > 0) {
        parts.push(`${title}其他级别: ` + otherLevels.map(other => other.levelText).join('、') + `\n\n`);
      }

      // Lineage
      if (detail.pedigree) {
        parts.push(`${title}的传承谱系: ` + htmlToMarkdown(detail.pedigree as string) + '\n\n');
      }
    }

    const markdown = parts.join('');
    try {
      const target = path.join(subDir, `${sanitizeFileNameAdvanced(entry.title) || entry.id}.md`);
      await writeFile(target, markdown);
    } catch (err) {
      console.error(`写入文件 ${entry.title || entry.id}.md 失败:`, err);
    }
  }
}
/**
 * Writes one Markdown file per inheritor found in the module-level `data`
 * array into `subDir`: front matter, basic-information list, database index
 * and, when a detail record is attached, the converted HTML sections.
 * A failed write is logged and the loop continues with the next entry.
 *
 * @param subDir Output directory (must already exist).
 */
async function generateMarkdownInheritor(subDir: string) {
  for (const item of data) {
    const detail = item.detail;

    // Only label the gender when the record actually carries one; previously
    // a missing value fell through to the "female" branch.
    // NOTE(review): '1' is rendered as male and any other value as female, as before — confirm encoding.
    const rawGender = detail?.gender;
    const genderText =
      rawGender == null || String(rawGender) === ''
        ? undefined
        : String(rawGender) === '1' ? '男' : '女';

    let md = '---\n';

    // Appends a front-matter line; falsy values are skipped.
    const addMeta = (key: string, value: unknown): void => {
      if (value) md += `${key}: ${value}\n`;
    };
    // Appends a bullet row; falsy values are skipped.
    const addRow = (key: string, value: unknown): void => {
      if (value) md += `- ${key}: ${value}\n`;
    };

    addMeta('level', item.levelText);
    addMeta('type', '非遗传承人');
    addMeta('unit', detail?.unit);
    addMeta('sex', genderText);
    addMeta('name', item.title);
    md += `---\n\n`;

    // Basic information
    md += `# ${item.title}\n\n`;
    if (item.desc) md += `${item.desc}\n\n`;
    md += `## 基本信息\n\n`;
    md += `${item.title}类型:非遗传承人\n`;
    addRow('民族', detail?.nation);
    addRow('性别', genderText);
    addRow('出生日期', detail?.dateBirth);
    addRow('出生地区', detail?.birthplace);
    addRow('单位', detail?.unit);
    // The list itself may be absent even when the detail record exists.
    addRow('传承项目', detail?.associationMeList?.[0]?.title);
    // Level comes from levelText; this row used to print batchText, which
    // duplicated the batch row below.
    addRow('传承人级别', item.levelText);
    addRow('公布批次', detail?.batchText);
    md += `\n## ${item.title}数据库索引ID\n\n`;
    md += `- 类型: inheritor\n`;
    md += `- ID: ${item.id || '无'}\n\n`;

    // Detail sections
    if (detail) {
      // Introduction
      if (detail.intro) {
        md += `## ${item.title}简介\n\n`;
        md += htmlToMarkdown(detail.intro) + '\n\n';
      }
      // Main content
      if (detail.content) {
        md += htmlToMarkdown(detail.content) + '\n\n';
      }
      // Awards
      if (detail.prize) {
        md += htmlToMarkdown(detail.prize as string) + '\n\n';
      }
      // Related projects
      if (detail.associationMeList && detail.associationMeList.length > 0)
        md += `${item.title}相关项目${detail.associationMeList.map(project => project.title).join('、')}\n\n`;
      // Training sites
      if (detail.ichSitesList && detail.ichSitesList.length > 0)
        md += `${item.title}相关非遗传习所: ${detail.ichSitesList.map(site => site.title).join('、')}\n\n`;
    }

    // One unwritable file must not abort the whole export (same policy as
    // generateMarkdownIch).
    try {
      await writeFile(path.join(subDir, `${sanitizeFileNameAdvanced(item.title) || item.id}.md`), md);
    } catch (err) {
      console.error(`写入文件 ${item.title || item.id}.md 失败:`, err);
    }
  }
}
/**
 * Writes one Markdown file per heritage-site record found in the
 * module-level `data` array into `subDir`: front matter, basic-information
 * list, database index and, when a detail record is attached, the converted
 * HTML sections. A failed write is logged and the loop continues with the
 * next entry.
 *
 * @param subDir Output directory (must already exist).
 */
async function generateMarkdownArtifact(subDir: string) {
  for (const item of data) {
    const detail = item.detail;

    let md = '---\n';

    // Appends a front-matter line; falsy values are skipped.
    const addMeta = (key: string, value: unknown): void => {
      if (value) md += `${key}: ${value}\n`;
    };
    // Appends a bullet row; falsy values are skipped.
    const addRow = (key: string, value: unknown): void => {
      if (value) md += `- ${key}: ${value}\n`;
    };

    addMeta('name', item.title);
    addMeta('crType', item.crTypeText);
    addMeta('region', item.regionText);
    addMeta('level', item.levelText);
    addMeta('address', item.address);
    addMeta('age', item.age);
    addMeta('type', '非遗文物');
    addMeta('unit', detail?.unit);
    md += `---\n\n`;

    // Basic information
    md += `# ${item.title}\n\n`;
    if (item.desc) md += `${item.desc}\n\n`;
    md += `## 基本信息\n\n`;
    md += `类型:非遗文物\n`;
    addRow('开放时间', detail?.openStatusText);
    addRow('年代', item.age);
    addRow('级别', item.levelText);
    addRow('所属区域', item.regionText);
    addRow('文物类型', item.crTypeText);
    addRow('单位', detail?.unit);
    md += `\n## 数据库索引ID\n\n`;
    md += `- 类型: artifact\n`;
    md += `- ID: ${item.id || '无'}\n\n`;
    if (item.video) {
      md += `## 视频\n\n`;
      // NOTE(review): the section body is empty — the heading is emitted
      // without the video itself; confirm what should be written here.
      md += `\n\n`;
    }

    // Detail sections
    if (detail) {
      // Introduction
      if (detail.intro) {
        md += `## 简介\n\n`;
        md += htmlToMarkdown(detail.intro) + '\n\n';
      }
      // Main content
      if (detail.content) {
        md += `## 详情\n\n`;
        md += htmlToMarkdown(detail.content) + '\n\n';
      }
      // Protected area
      if (detail.protectedArea) {
        md += `## 保护范围\n\n`;
        md += htmlToMarkdown(detail.protectedArea as string) + '\n\n';
      }
      // Built environment
      if (detail.environment) {
        md += `## 建筑环境\n\n`;
        md += htmlToMarkdown(detail.environment as string) + '\n\n';
      }
      // Value assessment
      if (detail.价值评估) {
        md += `## 价值评估\n\n`;
        md += htmlToMarkdown(detail.价值评估 as string) + '\n\n';
      }
    }

    // One unwritable file must not abort the whole export (same policy as
    // generateMarkdownIch).
    try {
      await writeFile(path.join(subDir, `${sanitizeFileNameAdvanced(item.title) || item.id}.md`), md);
    } catch (err) {
      console.error(`写入文件 ${item.title || item.id}.md 失败:`, err);
    }
  }
}
/** Creates `dist/<name>` below the current working directory (if missing) and returns its path. */
function ensureOutputDir(name: string): string {
  const dir = path.join(cwd(), 'dist', name);
  // `recursive: true` makes this a no-op when the directory already exists.
  fs.mkdirSync(dir, { recursive: true });
  return dir;
}

/**
 * Shared export pipeline: fetch the list, append it to the module-level
 * `data` array, attach the detail record to every entry (sequentially, one
 * request at a time), then render the Markdown files and wait for them.
 */
async function exportContent(
  dirName: string,
  fetchList: () => Promise<{ list: GetContentListItem[] }>,
  fetchDetail: (id: GetContentListItem['id']) => unknown,
  render: (dir: string) => Promise<void>,
): Promise<void> {
  const dir = ensureOutputDir(dirName);
  const { list } = await fetchList();
  for (const item of list) data.push(item);
  for (const item of data)
    item.detail = (await fetchDetail(item.id)) as GetContentDetailItem;
  // Awaited so that main() only resolves once every file has been written
  // and so that write errors propagate to the caller.
  await render(dir);
}

/**
 * CLI entry point. `argv[2]` selects the action:
 * `ich` | `seminar` | `old` | `unit` | `inheritor` | `artifact` export the
 * corresponding content type to `dist/<type>`; `publishToDify <datasetId>
 * <localDir>` uploads a directory of Markdown files. Any other value prints
 * an "unsupported type" message.
 */
async function main() {
  const type = argv[2];
  switch (type) {
    case 'ich':
      await exportContent(
        'ich',
        () => ProjectsContent.getContentList(new GetContentListParams(), 1, 1000),
        id => ProjectsContent.getContentDetail(id),
        dir => generateMarkdownIch(dir, '非遗项目'),
      );
      break;
    case 'seminar':
      await exportContent(
        'seminar',
        () => SeminarContent.getContentList(new GetContentListParams(), 1, 1000),
        id => SeminarContent.getContentDetail(id),
        dir => generateMarkdownIch(dir, '非遗传习所'),
      );
      break;
    case 'old':
      await exportContent(
        'old',
        () => CommonContent.getContentList(
          new GetContentListParams()
            .setModelId(17)
            .setMainBodyColumnId(312),
          1,
          1000,
        ),
        id => CommonContent.getContentDetail(id),
        dir => generateMarkdownIch(dir, '老字号'),
      );
      break;
    case 'unit':
      await exportContent(
        'unit',
        () => UnitContent.getContentList(new GetContentListParams(), 1, 1000),
        id => UnitContent.getContentDetail(id),
        dir => generateMarkdownIch(dir, '非遗保护单位'),
      );
      break;
    case 'inheritor':
      await exportContent(
        'inheritor',
        () => InheritorContent.getContentList(new GetContentListParams(), 1, 1000),
        id => InheritorContent.getContentDetail(id),
        dir => generateMarkdownInheritor(dir),
      );
      break;
    case 'artifact':
      await exportContent(
        'artifact',
        () => UnmoveableContent.getContentList(new GetContentListParams(), 1, 1000),
        id => UnmoveableContent.getContentDetail(id),
        dir => generateMarkdownArtifact(dir),
      );
      break;
    case 'publishToDify': {
      const datasetId = argv[3];
      const localDir = argv[4];
      if (!datasetId || !localDir) {
        console.error('请提供 datasetId 和 localDir 参数');
        console.error('用法: node ich.js publishToDify <datasetId> <localDir>');
        break;
      }
      await publishToDify(datasetId, localDir);
      break;
    }
    default:
      console.log('不支持的类型');
      break;
  }
}
/**
 * Uploads every `.md` file in `localDir` to a Dify knowledge base through
 * the "create document by text" endpoint, one request per file.
 *
 * Configuration comes from the environment:
 * - `DIFY_API_KEY` (required) — dataset API key. No key is embedded in the
 *   source any more; the function aborts with a message when it is unset.
 * - `DIFY_API_URL` (optional) — datasets base URL, defaults to
 *   `http://localhost:8089/v1/datasets`.
 *
 * A failure for one file is logged and the remaining files are still
 * processed; a summary is printed at the end.
 *
 * @param datasetId Target Dify dataset (knowledge base) id.
 * @param localDir Directory containing the Markdown files.
 */
async function publishToDify(datasetId: string, localDir: string) {
  // Dify API configuration
  const apiKey = process.env.DIFY_API_KEY;
  // `||` (not `??`) so that an empty variable also falls back to the default.
  const apiUrl = (process.env.DIFY_API_URL || 'http://localhost:8089/v1/datasets').replace(/\/+$/, '');

  if (!apiKey) {
    console.error('请设置 DIFY_API_KEY 环境变量');
    return;
  }
  if (!fs.existsSync(localDir)) {
    console.error(`本地目录不存在: ${localDir}`);
    return;
  }

  console.log(`开始上传文档到 Dify 知识库 (ID: ${datasetId})...`);
  console.log(`本地目录: ${localDir}`);

  let mdFiles: string[];
  try {
    // Collect all Markdown files in the directory
    mdFiles = fs.readdirSync(localDir).filter(file => file.endsWith('.md'));
  } catch (error) {
    console.error('上传过程中发生错误:', error);
    return;
  }
  if (mdFiles.length === 0) {
    console.log('目录中没有找到 .md 文件');
    return;
  }
  console.log(`找到 ${mdFiles.length} 个 .md 文件`);

  const endpoint = `${apiUrl}/${encodeURIComponent(datasetId)}/document/create-by-text`;
  let failed = 0;

  // Upload the files one by one
  for (const file of mdFiles) {
    console.log(`正在上传: ${file}`);
    try {
      const content = fs.readFileSync(path.join(localDir, file), 'utf8');
      const response = await fetch(endpoint, {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
          'Authorization': `Bearer ${apiKey}`,
        },
        body: JSON.stringify({
          name: file,
          text: content,
          indexing_technique: 'high_quality',
          doc_language: 'zh',
        }),
      });

      // Read the body as text first: an error response is not guaranteed to
      // be JSON, and a parse failure must not abort the remaining uploads.
      const bodyText = await response.text();
      if (response.ok) {
        console.log(`✅ 上传成功: ${file}`);
        continue;
      }

      failed++;
      let message = '';
      try {
        const parsed: unknown = JSON.parse(bodyText);
        if (typeof parsed === 'object' && parsed !== null && 'message' in parsed) {
          const candidate = (parsed as { message: unknown }).message;
          if (typeof candidate === 'string') message = candidate;
        }
      } catch {
        message = bodyText;
      }
      console.error(`❌ 上传失败: ${file}`);
      console.error(`   错误信息: ${message || response.statusText}`);
    } catch (error) {
      // Network error or unreadable file: record it and keep going.
      failed++;
      console.error(`❌ 上传失败: ${file}`);
      console.error('上传过程中发生错误:', error);
    }
  }

  if (failed === 0) {
    console.log('\n🎉 所有文档上传完成!');
  } else {
    console.error(`\n上传结束: 成功 ${mdFiles.length - failed} 个, 失败 ${failed} 个`);
  }
}
// Script entry point. The promise is handled explicitly so that a failure is
// reported and reflected in the exit code instead of surfacing as an
// unhandled rejection.
main().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});
|