快乐的梦鱼 il y a 3 semaines
Parent
commit
fd49535d81
5 fichiers modifiés avec 212 ajouts et 88 suppressions
  1. 3 3
      api/RequestModules.ts
  2. 199 82
      scripts/ExportToAi/ich.ts
  3. 5 2
      scripts/ExportToAi/system.txt
  4. 1 1
      tsconfig.ich.json
  5. 4 0
      webpack.ich.config.js

+ 3 - 3
api/RequestModules.ts

@@ -19,7 +19,7 @@ import {
   appendPostParams,
 } from "@imengyu/imengyu-utils";
 import type { DataModel, KeyValue, NewDataModel } from "@imengyu/js-request-transform";
-import { useAuthStore } from "@/stores/auth";
+//import { useAuthStore } from "@/stores/auth";
 import { Modal } from "ant-design-vue";
 
 /**
@@ -40,11 +40,11 @@ function matchNotReportMessage(str: string) {
 //请求拦截器
 function requestInceptor(url: string, req: RequestOptions) {
   //获取store中的token,追加到头;
-  const autoStore = useAuthStore();
+  /*const autoStore = useAuthStore();
   if (StringUtils.isNullOrEmpty((req.header as KeyValue).token as string)) {
     req.header['token'] = autoStore.token;
     req.header['__token__'] = autoStore.token;
-  }
+  }*/
   if (req.method == 'GET') {
     //追加GET参数
     url = appendGetUrlParams(url, 'main_body_id', ApiCofig.mainBodyId);

+ 199 - 82
scripts/ExportToAi/ich.ts

@@ -22,7 +22,7 @@ function htmlToMarkdown(html: string): string {
   html = html.replace(/<h3[^>]*>(.*?)<\/h3>/gi, '### $1\n\n');
   
   // 处理段落
-  html = html.replace(/<p[^>]*>(.*?)<\/p>/gi, '$1\n\n');
+  html = html.replace(/<p[^>]*>(.*?)<\/p>/gi, '$1');
   
   // 处理加粗
   html = html.replace(/<strong[^>]*>(.*?)<\/strong>/gi, '**$1**');
@@ -51,7 +51,7 @@ function htmlToMarkdown(html: string): string {
   html = html.replace(/<a[^>]*href="([^"]*)"[^>]*>(.*?)<\/a>/gi, '[$2]($1)');
   
   // 处理换行
-  html = html.replace(/<br[^>]*>/gi, '\n');
+  html = html.replace(/<br[^>]*>/gi, ' ');
   
   // 去除所有HTML标签
   html = html.replace(/<[^>]*>/g, '');
@@ -61,17 +61,73 @@ function htmlToMarkdown(html: string): string {
   
   return html.trim();
 }
/**
 * Turns an arbitrary string (for example a record title) into a file name that
 * can be created on Windows, macOS and Linux.
 *
 * Steps, in order:
 *  1. Replace characters that are illegal in file names (`< > : " / \ | ? *`
 *     and ASCII control characters) with `replacement`.
 *  2. Collapse consecutive runs of `replacement` into a single occurrence.
 *  3. Trim surrounding whitespace and strip trailing dots/spaces.
 *  4. If the part before the first dot is a Windows reserved device name
 *     (`con`, `nul`, `com1`, ...), append `_` to that part so the extension is
 *     preserved and the result is deterministic across runs.
 *  5. Truncate to at most 200 UTF-16 code units, keeping the extension when it
 *     fits.
 *
 * @param fileName    Raw name to sanitize.
 * @param replacement Text substituted for each illegal character (default `_`).
 *                    Illegal characters inside the replacement itself are
 *                    dropped; an empty replacement simply deletes them.
 * @returns The sanitized name, `'unnamed_file'` when nothing usable is left, or
 *          `''` when `fileName` is not a string at runtime.
 */
function sanitizeFileNameAdvanced(fileName: string, replacement = '_'): string {
  // Input may come from untyped JSON at runtime, so keep the defensive check.
  if (typeof fileName !== 'string') return '';

  // Characters rejected by at least one mainstream file system, plus the
  // invisible ASCII control characters \x00-\x1F.
  const illegalChars = /[<>:"/\\|?*\x00-\x1F]/g;
  // Windows reserved device names (matched case-insensitively).
  const windowsReservedNames = new Set([
    'con', 'nul', 'prn', 'aux', 'com1', 'com2', 'com3', 'com4',
    'com5', 'com6', 'com7', 'com8', 'com9', 'lpt1', 'lpt2',
    'lpt3', 'lpt4', 'lpt5', 'lpt6', 'lpt7', 'lpt8', 'lpt9',
  ]);
  // Kept well below the common 255 limit to leave room for a suffix such as ".md".
  const maxFileNameLength = 200;

  // Trailing dots and spaces are silently dropped by Windows, so remove them all.
  const stripEdges = (value: string): string => value.trim().replace(/[.\s]+$/, '');

  // A replacement containing illegal characters would re-introduce them.
  const safeReplacement = replacement.replace(illegalChars, '');

  let safeFileName = fileName.replace(illegalChars, safeReplacement);

  if (safeReplacement !== '') {
    // Escape regex metacharacters so replacements like "." or "+" are matched literally.
    const escaped = safeReplacement.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    safeFileName = safeFileName.replace(new RegExp(`(?:${escaped}){2,}`, 'g'), safeReplacement);
  }

  safeFileName = stripEdges(safeFileName);

  // Windows decides "reserved" from the part before the FIRST dot, so the
  // disambiguating suffix has to be inserted there, not after the extension.
  const firstDot = safeFileName.indexOf('.');
  const baseName = firstDot === -1 ? safeFileName : safeFileName.slice(0, firstDot);
  if (windowsReservedNames.has(baseName.trim().toLowerCase())) {
    safeFileName = `${baseName}_${safeFileName.slice(baseName.length)}`;
  }

  if (safeFileName.length > maxFileNameLength) {
    const lastDot = safeFileName.lastIndexOf('.');
    const ext = lastDot > 0 ? safeFileName.slice(lastDot + 1) : '';
    // Only preserve the extension when doing so leaves room for a stem.
    const keepExt = ext !== '' && ext.length + 1 < maxFileNameLength;
    const budget = keepExt ? maxFileNameLength - ext.length - 1 : maxFileNameLength;

    let stem = safeFileName.slice(0, budget);
    // Do not leave half of a surrogate pair at the cut position.
    if (/[\uD800-\uDBFF]$/.test(stem)) stem = stem.slice(0, -1);
    stem = stripEdges(stem);

    safeFileName = keepExt ? `${stem}.${ext}` : stem;
  }

  // Fall back to a fixed name when sanitizing removed everything.
  return safeFileName !== '' ? safeFileName : 'unnamed_file';
}
+
 // 生成Markdown文本
 async function generateMarkdownIch(subDir: string, type: string) {
   
   for (const item of data) {
     
-    let md = '';
+    let md = '---\n';    
+    
+    function addMeta(key: string, value: any) {
+      if (value)
+        md += `${key}: ${value}\n`;
+    }
+
+    addMeta('level', item.levelText);
+    addMeta('crType', item.crTypeText);
+    addMeta('region', item.regionText);
+    addMeta('batch', item.batchText);
+    addMeta('ichType', item.ichTypeText);
+    addMeta('type', type);
+    addMeta('unit', item.detail?.unit);
+    addMeta('name', item.title);
+    addMeta('address', item.address);
+
+    md += `---\n\n`;
+
     // 基本信息
     md += `# ${item.title}\n\n`;
     if (item.desc)
       md += `${item.desc}\n\n`;
-    md += `## 基本信息\n\n`;
     md += `类型:${type}\n\n`;
 
     function addRow(key: string, value: any) {
@@ -79,100 +135,74 @@ async function generateMarkdownIch(subDir: string, type: string) {
         md += `- ${key}: ${value}\n`;
     }
 
-    addRow('级别', item.levelText);
-    addRow('类别', item.ichTypeText);
+    addRow('非遗级别', item.levelText);
+    addRow('非遗类别', item.ichTypeText);
     addRow('地区', item.regionText);
     addRow('批次', item.batchText);
     addRow('保护单位', item.unit);
     addRow('地址', item.address);
     addRow('字号名称', item.fontName);
     addRow('认定类型', item.brandType);
-    addRow('其他级别保护单位', item.detail?.otherLevel && item.detail.otherLevel.length > 0 ? `${item.detail.otherLevel.length}个` : '');
     
-    md += `\n## 数据库索引ID\n\n`;
-    md += `- 类型: intangible\n`;
-    md += `- ID: ${item.id || '无'}\n\n`;
+    md += `数据库索引ID: 类型: intangible ID: ${item.id || '无'}\n\n`;
     
     // 详细信息
     if (item.detail) {
       const detail = item.detail as GetContentDetailItem;
       // 简介
-      if (detail.intro) {
-        md += `## 简介\n\n`;
+      if (detail.intro)
         md += htmlToMarkdown(detail.intro) + '\n\n';
-      }
       // 内容
-      if (detail.content) {
-        md += `## 内容\n\n`;
+      if (detail.content)
         md += htmlToMarkdown(detail.content) + '\n\n';
-      }
       // 传承谱系
       if (detail.pedigree) {
         md += `## 传承谱系\n\n`;
-        md += htmlToMarkdown(detail.pedigree as string) + '\n\n';
-      }
-      // 视频
-      if (detail.video) {
-        md += `## 视频\n\n`;
-        md += `[视频](${detail.video})\n\n`;
+        md += '传承谱: ' + htmlToMarkdown(detail.pedigree as string) + '\n\n';
       }
-      if (detail.publishVideo)
-        md += `[介绍视频](${detail.publishVideo})\n\n`;
       
       // 传承人
       if (detail.inheritorsList && detail.inheritorsList.length > 0) {
-        md += `## 相关传承人\n\n`;
-        if (detail.inheritor) {
+        md += `相关非遗传承人:${detail.inheritorsList.map(inheritor => inheritor.title).join('、')}\n\n`;
+        if (detail.inheritor) 
           md += htmlToMarkdown(detail.inheritor) + '\n\n';
-        }
-        
-        detail.inheritorsList.forEach(inheritor => {
-          md += `### ${inheritor.title}\n\n`;
-          md += `级别:${inheritor.levelLext || '无'}\n\n`;
-
-          md += `#### 数据库索引ID\n\n`;
-          md += `- 类型: inheritor\n`;
-          md += `- ID: ${inheritor.id || '无'}\n\n`;
-        });
       }
       
       // 传习所
-      if (detail.ichSitesList && detail.ichSitesList.length > 0) {
-        md += `## 相关传习所\n\n`;
-        detail.ichSitesList.forEach(site => { 
-          md += `### ${site.title}\n\n`;
-          md += `级别:${site.levelLext || '无'}\n\n`;
-          md += `地址:${site.address || '无'}\n\n`;
-
-          md += `#### 数据库索引ID\n\n`;
-          md += `- 类型: seminar\n`;
-          md += `- ID: ${site.id || '无'}\n\n`;
-        });
-      }
+      if (detail.ichSitesList && detail.ichSitesList.length > 0)
+        md += `相关非遗传习所: ${detail.ichSitesList.map(site => site.title).join('、')}\n\n`; 
 
       // 同级别项目
-      if (detail.otherLevel && detail.otherLevel.length > 0) {
-        md += `## 其他级别非遗项目\n\n`;
-        detail.otherLevel.forEach(project => { 
-          md += `### ${project.title}\n\n`;
-          md += `级别:${project.levelLext || '无'}\n\n`;
-          md += `保护单位:${project.unit || '无'}\n\n`;
-
-          md += `#### 数据库索引ID\n\n`;
-          md += `- 类型: intangible\n`;
-          md += `- ID: ${project.id || '无'}\n\n`;
-        });
-      }
+      if (detail.otherLevel && detail.otherLevel.length > 0)
+        md += '其他级别非遗项目: ' + detail.otherLevel.map(project => project.title).join('、') + `\n\n`;
     }
 
-    await writeFile(path.join(subDir, `${item.id}.md`), md);
+    try {
+      await writeFile(path.join(subDir, `${sanitizeFileNameAdvanced(item.title) || item.id}.md`), md);
+    } catch (err) {
+      console.error(`写入文件 ${item.title || item.id}.md 失败:`, err);
+    }
   }
 }
 async function generateMarkdownInheritor(subDir: string) {
   
   for (const item of data) {
     
-    let md = '';
+    let md = '---\n';    
+    
+    function addMeta(key: string, value: any) {
+      if (value)
+        md += `${key}: ${value}\n`;
+    }
+
+    addMeta('level', item.levelText);
+    addMeta('type', '非遗传承人');
+    addMeta('unit', item.detail?.unit);
+    addMeta('sex', item.detail?.gender == '1'? '男' : '女');
+    addMeta('name', item.title);
+
+    md += `---\n\n`;
+
     // 基本信息
     md += `# ${item.title}\n\n`;
     if (item.desc)
@@ -216,38 +246,43 @@ async function generateMarkdownInheritor(subDir: string) {
       }
       // 相关项目
       if (detail.associationMeList && detail.associationMeList.length > 0) {
-        md += `## 相关项目\n\n`;        
-        detail.associationMeList.forEach(inheritor => {
-          md += `### ${inheritor.title}\n\n`;
-
-          md += `#### 数据库索引ID\n\n`;
-          md += `- 类型: intangible\n`;
-          md += `- ID: ${inheritor.id || '无'}\n\n`;
-        });
+        md += `## 相关项目\n\n`;  
+        md += detail.associationMeList.map(project => project.title).join('、');
+        md += `\n\n`;  
       }   
       // 传习所
       if (detail.ichSitesList && detail.ichSitesList.length > 0) {
         md += `## 相关传习所\n\n`;
-        detail.ichSitesList.forEach(site => { 
-          md += `### ${site.title}\n\n`;
-          md += `级别:${site.levelLext || '无'}\n\n`;
-          md += `地址:${site.address || '无'}\n\n`;
-
-          md += `##### 数据库索引ID\n\n`;
-          md += `- 类型: seminar\n`;
-          md += `- ID: ${site.id || '无'}\n\n`;
-        });
+        md += detail.ichSitesList.map(site => site.title).join('、');
+        md += `\n\n`;  
       }
     }
 
-    await writeFile(path.join(subDir, `${item.id}.md`), md);
+    await writeFile(path.join(subDir, `${sanitizeFileNameAdvanced(item.title) ||item.id}.md`), md);
   }
 }
 async function generateMarkdownArtifact(subDir: string) {
   
   for (const item of data) {
     
-    let md = '';
+    let md = '---\n';    
+    
+    function addMeta(key: string, value: any) {
+      if (value)
+        md += `${key}: ${value}\n`;
+    }
+
+    addMeta('name', item.title);
+    addMeta('crType', item.crTypeText);
+    addMeta('region', item.regionText);
+    addMeta('level', item.levelText);
+    addMeta('address', item.address);
+    addMeta('age', item.age);
+    addMeta('type', '非遗文物');
+    addMeta('unit', item.detail?.unit);
+
+    md += `---\n\n`;
+
     // 基本信息
     md += `# ${item.title}\n\n`;
     if (item.desc)
@@ -302,7 +337,7 @@ async function generateMarkdownArtifact(subDir: string) {
       }
     }
 
-    await writeFile(path.join(subDir, `${item.id}.md`), md);
+    await writeFile(path.join(subDir, `${sanitizeFileNameAdvanced(item.title) ||item.id}.md`), md);
   }
 }
 
@@ -381,10 +416,92 @@ async function main() {
       generateMarkdownArtifact(dir);
       break;
     }
+    case 'publishToDify': {
+      const datasetId = argv[3];
+      const localDir = argv[4];
+      if (!datasetId || !localDir) {
+        console.error('请提供 datasetId 和 localDir 参数');
+        console.error('用法: node ich.js publishToDify <datasetId> <localDir>');
+        break;
+      }
+      await publishToDify(datasetId, localDir);
+      break;
+    }
     default:
       console.log('不支持的类型');
       break;
   }
 }
 
/**
 * Uploads every `.md` file found directly inside `localDir` to a Dify
 * knowledge base through the "create document by text" endpoint, one request
 * per file, sequentially.
 *
 * Configuration is read from the environment:
 *  - `DIFY_API_KEY` (required): dataset API key sent as a Bearer token. There is
 *    deliberately no built-in fallback so that no secret lives in source control.
 *  - `DIFY_API_URL` (optional): base URL of the datasets API; defaults to
 *    `http://localhost:8089/v1/datasets`.
 *
 * Failures are reported on the console; the function itself never rejects.
 *
 * @param datasetId ID of the target Dify dataset.
 * @param localDir  Directory whose `.md` files are uploaded (not recursive).
 */
async function publishToDify(datasetId: string, localDir: string): Promise<void> {
  // Dify API configuration.
  const DIFY_API_KEY = process.env.DIFY_API_KEY;
  // `||` rather than `??` so that an empty variable also falls back to the default.
  const DIFY_API_URL = (process.env.DIFY_API_URL || 'http://localhost:8089/v1/datasets').replace(/\/+$/, '');

  if (!DIFY_API_KEY) {
    console.error('请设置 DIFY_API_KEY 环境变量');
    return;
  }

  if (!fs.existsSync(localDir)) {
    console.error(`本地目录不存在: ${localDir}`);
    return;
  }

  console.log(`开始上传文档到 Dify 知识库 (ID: ${datasetId})...`);
  console.log(`本地目录: ${localDir}`);

  try {
    // Collect the markdown files that sit directly in the directory.
    const mdFiles = fs.readdirSync(localDir).filter(file => file.endsWith('.md'));

    if (mdFiles.length === 0) {
      console.log('目录中没有找到 .md 文件');
      return;
    }

    console.log(`找到 ${mdFiles.length} 个 .md 文件`);

    const endpoint = `${DIFY_API_URL}/${encodeURIComponent(datasetId)}/document/create-by-text`;
    let failedCount = 0;

    // Upload one file at a time.
    for (const file of mdFiles) {
      console.log(`正在上传: ${file}`);

      const content = fs.readFileSync(path.join(localDir, file), 'utf8');

      const response = await fetch(endpoint, {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
          'Authorization': `Bearer ${DIFY_API_KEY}`,
        },
        body: JSON.stringify({
          name: file,
          text: content,
          indexing_technique: 'high_quality',
          doc_language: 'zh',
        }),
      });

      // Always consume the body as text: an error page from a proxy may not be
      // JSON, and a parse failure must not abort the remaining uploads.
      const rawBody = await response.text();

      if (response.ok) {
        console.log(`✅ 上传成功: ${file}`);
        continue;
      }

      failedCount += 1;
      let message = '';
      try {
        const parsed: unknown = JSON.parse(rawBody);
        if (typeof parsed === 'object' && parsed !== null && 'message' in parsed) {
          const candidate = (parsed as { message?: unknown }).message;
          if (typeof candidate === 'string') message = candidate;
        }
      } catch {
        // Body was not JSON; fall back to the HTTP status below.
      }
      console.error(`❌ 上传失败: ${file}`);
      console.error(`   错误信息: ${message || response.statusText || `HTTP ${response.status}`}`);
    }

    if (failedCount === 0) {
      console.log('\n🎉 所有文档上传完成!');
    } else {
      console.error(`\n上传结束: 成功 ${mdFiles.length - failedCount} 个, 失败 ${failedCount} 个`);
    }
  } catch (error) {
    console.error('上传过程中发生错误:', error);
  }
}
+
 main();

+ 5 - 2
scripts/ExportToAi/system.txt

@@ -6,8 +6,11 @@
 - 如果用户的问题模糊或不完整,请基于知识库尝试合理澄清,但不要假设意图。
 - 如涉及法律、医疗、财务等专业领域,须注明“本回答仅供参考,不构成专业建议”。
 
+您好,我是“闽南文化生态保护区(厦门市)”的智能问答助手,可以为您解答有关闽南文化的相关问题
 - 为我介绍南音的简介
-- 厦门有哪些国家级非遗项目
-- 南音有哪些非遗传承人
+- 厦门有哪些著名国家级非遗项目?
+- 厦门有几个市级非遗项目?
+- 南音有哪些非遗传承人?
+- 为我介绍庄海蓉
 
 包含了有关于闽南文化生态保护区(厦门市)的全部非遗项目的列表与详情

+ 1 - 1
tsconfig.ich.json

@@ -8,7 +8,7 @@
     "skipLibCheck": true,
     "strict": true,
     "outDir": "./dist",
-    "lib": ["ES2020"],
+    "lib": ["ES2022"],
     "baseUrl": ".",
     "paths": {
       "@/*": ["./*"]

+ 4 - 0
webpack.ich.config.js

@@ -19,6 +19,10 @@ export default {
       '@': path.resolve(__dirname),
     }
   },
+  optimization: {
+    minimize: false,
+  },
+  devtool: 'source-map',
   module: {
     rules: [
       {