ich.ts 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508
  1. import { writeFile } from 'fs/promises';
  2. import CommonContent, { GetContentListParams, type GetContentDetailItem, type GetContentListItem } from '../../api/CommonContent';
  3. import ProjectsContent from '../../api/inheritor/ProjectsContent';
  4. import InheritorContent from '../../api/inheritor/InheritorContent';
  5. import SeminarContent from '../../api/inheritor/SeminarContent';
  6. import UnitContent from '../../api/inheritor/UnitContent';
  7. import UnmoveableContent from '../../api/inheritor/UnmoveableContent';
  8. import path from 'path';
  9. import fs from 'fs';
  10. import { argv, cwd } from 'process';
  11. const data = [] as Array<GetContentListItem & { detail?: GetContentDetailItem }>;
  12. // HTML转Markdown的简单实现
  13. function htmlToMarkdown(html: string): string {
  14. if (!html) return '';
  15. // 处理标题
  16. html = html.replace(/<h1[^>]*>(.*?)<\/h1>/gi, '# $1\n\n');
  17. html = html.replace(/<h2[^>]*>(.*?)<\/h2>/gi, '## $1\n\n');
  18. html = html.replace(/<h3[^>]*>(.*?)<\/h3>/gi, '### $1\n\n');
  19. // 处理段落
  20. html = html.replace(/<p[^>]*>(.*?)<\/p>/gi, '$1');
  21. // 处理加粗
  22. html = html.replace(/<strong[^>]*>(.*?)<\/strong>/gi, '**$1**');
  23. html = html.replace(/<b[^>]*>(.*?)<\/b>/gi, '**$1**');
  24. // 处理斜体
  25. html = html.replace(/<em[^>]*>(.*?)<\/em>/gi, '*$1*');
  26. html = html.replace(/<i[^>]*>(.*?)<\/i>/gi, '*$1*');
  27. // 处理列表
  28. html = html.replace(/<ul[^>]*>([\s\S]*?)<\/ul>/gi, (match, content) => {
  29. return content.replace(/<li[^>]*>(.*?)<\/li>/gi, '- $1\n') + '\n';
  30. });
  31. html = html.replace(/<ol[^>]*>([\s\S]*?)<\/ol>/gi, (match, content, index) => {
  32. let count = 1;
  33. return content.replace(/<li[^>]*>(.*?)<\/li>/gi, () => {
  34. return `${count++}. $1\n`;
  35. }) + '\n';
  36. });
  37. // 处理图片
  38. html = html.replace(/<img[^>]*src="([^"]*)"[^>]*alt="([^"]*)"[^>]*>/gi, '![$2]($1)');
  39. // 处理链接
  40. html = html.replace(/<a[^>]*href="([^"]*)"[^>]*>(.*?)<\/a>/gi, '[$2]($1)');
  41. // 处理换行
  42. html = html.replace(/<br[^>]*>/gi, ' ');
  43. // 去除所有HTML标签
  44. html = html.replace(/<[^>]*>/g, '');
  45. // 处理多余的换行
  46. html = html.replace(/\n\s*\n/g, '\n\n');
  47. return html.trim();
  48. }
  49. function sanitizeFileNameAdvanced(fileName: string, replacement = '_') {
  50. if (typeof fileName !== 'string') return '';
  51. // 1. 定义跨平台非法字符正则(核心非法字符+不可见控制字符)
  52. const illegalRegex = /[<>:"/\\|?*\x00-\x1F]/g; // \x00-\x1F 是不可见ASCII控制字符
  53. // 2. 定义Windows保留文件名(不区分大小写)
  54. const windowsReservedNames = new Set([
  55. 'con', 'nul', 'prn', 'aux', 'com1', 'com2', 'com3', 'com4',
  56. 'com5', 'com6', 'com7', 'com8', 'com9', 'lpt1', 'lpt2',
  57. 'lpt3', 'lpt4', 'lpt5', 'lpt6', 'lpt7', 'lpt8', 'lpt9'
  58. ]);
  59. // 步骤1:替换非法字符
  60. let safeFileName = fileName.replace(illegalRegex, replacement);
  61. // 步骤2:清理首尾空格、句点,压缩连续替换符(避免多个下划线连在一起)
  62. safeFileName = safeFileName
  63. .trim()
  64. .replace(/\.$/, '') // 移除结尾句点
  65. .replace(new RegExp(`${replacement}+`, 'g'), replacement); // 压缩连续替换符
  66. // 步骤3:处理Windows保留文件名
  67. const fileNameWithoutExt = safeFileName.split('.')[0].toLowerCase();
  68. if (windowsReservedNames.has(fileNameWithoutExt)) {
  69. safeFileName = `${safeFileName}_${Date.now()}`; // 追加时间戳避免冲突
  70. }
  71. // 步骤4:限制文件名长度(Windows最大255字符,这里取200字符留有余地)
  72. const maxFileNameLength = 200;
  73. if (safeFileName.length > maxFileNameLength) {
  74. const ext = safeFileName.includes('.') ? safeFileName.split('.').pop() : '';
  75. const name = safeFileName.slice(0, maxFileNameLength - (ext ? ext.length + 1 : 0));
  76. safeFileName = ext ? `${name}.${ext}` : name;
  77. }
  78. // 步骤5:兜底空文件名
  79. return safeFileName || 'unnamed_file';
  80. }
  81. // 生成Markdown文本
  82. async function generateMarkdownIch(subDir: string, type: string) {
  83. for (const item of data) {
  84. let md = '---\n';
  85. function addMeta(key: string, value: any) {
  86. if (value)
  87. md += `${key}: ${value}\n`;
  88. }
  89. addMeta('level', item.levelText);
  90. addMeta('crType', item.crTypeText);
  91. addMeta('region', item.regionText);
  92. addMeta('batch', item.batchText);
  93. addMeta('ichType', item.ichTypeText);
  94. addMeta('type', type);
  95. addMeta('unit', item.detail?.unit);
  96. addMeta('name', item.title);
  97. addMeta('address', item.address);
  98. md += `---\n\n`;
  99. // 基本信息
  100. md += `# ${item.title}\n\n`;
  101. if (item.desc)
  102. md += `${item.desc}\n\n`;
  103. md += `类型:${type}\n\n`;
  104. function addRow(key: string, value: any) {
  105. if (value)
  106. md += `- ${key}: ${value}\n`;
  107. }
  108. addRow('非遗级别', item.levelText);
  109. addRow('非遗类别', item.ichTypeText);
  110. addRow('地区', item.regionText);
  111. addRow('批次', item.batchText);
  112. addRow('保护单位', item.unit);
  113. addRow('地址', item.address);
  114. addRow('字号名称', item.fontName);
  115. addRow('认定类型', item.brandType);
  116. md += `数据库索引ID: 类型: intangible ID: ${item.id || '无'}\n\n`;
  117. // 详细信息
  118. if (item.detail) {
  119. const detail = item.detail as GetContentDetailItem;
  120. // 简介
  121. if (detail.intro)
  122. md += htmlToMarkdown(detail.intro) + '\n\n';
  123. // 内容
  124. if (detail.content)
  125. md += htmlToMarkdown(detail.content) + '\n\n';
  126. // 传承谱系
  127. if (detail.pedigree) {
  128. md += `## 传承谱系\n\n`;
  129. md += '传承谱: ' + htmlToMarkdown(detail.pedigree as string) + '\n\n';
  130. }
  131. // 传承人
  132. if (detail.inheritorsList && detail.inheritorsList.length > 0) {
  133. md += `相关非遗传承人:${detail.inheritorsList.map(inheritor => inheritor.title).join('、')}\n\n`;
  134. if (detail.inheritor)
  135. md += htmlToMarkdown(detail.inheritor) + '\n\n';
  136. }
  137. // 传习所
  138. if (detail.ichSitesList && detail.ichSitesList.length > 0)
  139. md += `相关非遗传习所: ${detail.ichSitesList.map(site => site.title).join('、')}\n\n`;
  140. // 同级别项目
  141. if (detail.otherLevel && detail.otherLevel.length > 0)
  142. md += '其他级别非遗项目: ' + detail.otherLevel.map(project => project.title).join('、') + `\n\n`;
  143. }
  144. try {
  145. await writeFile(path.join(subDir, `${sanitizeFileNameAdvanced(item.title) || item.id}.md`), md);
  146. } catch (err) {
  147. console.error(`写入文件 ${item.title || item.id}.md 失败:`, err);
  148. }
  149. }
  150. }
  151. async function generateMarkdownInheritor(subDir: string) {
  152. for (const item of data) {
  153. let md = '---\n';
  154. function addMeta(key: string, value: any) {
  155. if (value)
  156. md += `${key}: ${value}\n`;
  157. }
  158. addMeta('level', item.levelText);
  159. addMeta('type', '非遗传承人');
  160. addMeta('unit', item.detail?.unit);
  161. addMeta('sex', item.detail?.gender == '1'? '男' : '女');
  162. addMeta('name', item.title);
  163. md += `---\n\n`;
  164. // 基本信息
  165. md += `# ${item.title}\n\n`;
  166. if (item.desc)
  167. md += `${item.desc}\n\n`;
  168. md += `## 基本信息\n\n`;
  169. md += `类型:非遗传承人\n\n`;
  170. function addRow(key: string, value: any) {
  171. if (value)
  172. md += `- ${key}: ${value}\n`;
  173. }
  174. addRow('民族', item.detail?.nation);
  175. addRow('性别', item.detail?.gender == '1'? '男' : '女');
  176. addRow('出生日期', item.detail?.dateBirth);
  177. addRow('出生地区', item.detail?.birthplace);
  178. addRow('单位', item.detail?.unit);
  179. addRow('传承项目', item.detail?.associationMeList[0]?.title);
  180. addRow('传承人级别', item.detail?.batchText);
  181. addRow('公布批次', item.detail?.batchText);
  182. md += `\n## 数据库索引ID\n\n`;
  183. md += `- 类型: inheritor\n`;
  184. md += `- ID: ${item.id || '无'}\n\n`;
  185. // 详细信息
  186. if (item.detail) {
  187. const detail = item.detail as GetContentDetailItem;
  188. // 简介
  189. if (detail.intro) {
  190. md += `## 简介\n\n`;
  191. md += htmlToMarkdown(detail.intro) + '\n\n';
  192. }
  193. if (detail.content) {
  194. md += `## 详情\n\n`;
  195. md += htmlToMarkdown(detail.content) + '\n\n';
  196. }
  197. // 奖项
  198. if (detail.prize) {
  199. md += `## 奖项\n\n`;
  200. md += htmlToMarkdown(detail.prize as string) + '\n\n';
  201. }
  202. // 相关项目
  203. if (detail.associationMeList && detail.associationMeList.length > 0) {
  204. md += `## 相关项目\n\n`;
  205. md += detail.associationMeList.map(project => project.title).join('、');
  206. md += `\n\n`;
  207. }
  208. // 传习所
  209. if (detail.ichSitesList && detail.ichSitesList.length > 0) {
  210. md += `## 相关传习所\n\n`;
  211. md += detail.ichSitesList.map(site => site.title).join('、');
  212. md += `\n\n`;
  213. }
  214. }
  215. await writeFile(path.join(subDir, `${sanitizeFileNameAdvanced(item.title) ||item.id}.md`), md);
  216. }
  217. }
  218. async function generateMarkdownArtifact(subDir: string) {
  219. for (const item of data) {
  220. let md = '---\n';
  221. function addMeta(key: string, value: any) {
  222. if (value)
  223. md += `${key}: ${value}\n`;
  224. }
  225. addMeta('name', item.title);
  226. addMeta('crType', item.crTypeText);
  227. addMeta('region', item.regionText);
  228. addMeta('level', item.levelText);
  229. addMeta('address', item.address);
  230. addMeta('age', item.age);
  231. addMeta('type', '非遗文物');
  232. addMeta('unit', item.detail?.unit);
  233. md += `---\n\n`;
  234. // 基本信息
  235. md += `# ${item.title}\n\n`;
  236. if (item.desc)
  237. md += `${item.desc}\n\n`;
  238. md += `## 基本信息\n\n`;
  239. md += `类型:非遗传承人\n\n`;
  240. function addRow(key: string, value: any) {
  241. if (value)
  242. md += `- ${key}: ${value}\n`;
  243. }
  244. addRow('开放时间', item.detail?.openStatusText);
  245. addRow('年代', item.age);
  246. addRow('级别', item.levelText);
  247. addRow('所属区域', item.regionText);
  248. addRow('文物类型', item.crTypeText);
  249. addRow('单位', item.detail?.unit);
  250. md += `\n## 数据库索引ID\n\n`;
  251. md += `- 类型: artifact\n`;
  252. md += `- ID: ${item.id || '无'}\n\n`;
  253. if (item.video) {
  254. md += `## 视频\n\n`;
  255. md += `![${item.title}视频](${item.video})\n\n`;
  256. }
  257. // 详细信息
  258. if (item.detail) {
  259. const detail = item.detail as GetContentDetailItem;
  260. // 简介
  261. if (detail.intro) {
  262. md += `## 简介\n\n`;
  263. md += htmlToMarkdown(detail.intro) + '\n\n';
  264. }
  265. if (detail.content) {
  266. md += `## 详情\n\n`;
  267. md += htmlToMarkdown(detail.content) + '\n\n';
  268. }
  269. // 奖项
  270. if (detail.protectedArea) {
  271. md += `## 保护范围\n\n`;
  272. md += htmlToMarkdown(detail.protectedArea as string) + '\n\n';
  273. }
  274. if (detail.environment) {
  275. md += `## 建筑环境\n\n`;
  276. md += htmlToMarkdown(detail.environment as string) + '\n\n';
  277. }
  278. if (detail.价值评估) {
  279. md += `## 价值评估\n\n`;
  280. md += htmlToMarkdown(detail.价值评估 as string) + '\n\n';
  281. }
  282. }
  283. await writeFile(path.join(subDir, `${sanitizeFileNameAdvanced(item.title) ||item.id}.md`), md);
  284. }
  285. }
  286. async function main() {
  287. const type = argv[2];
  288. function makeDir(nanme: string) {
  289. const dir = path.join(cwd(), `dist/${nanme}`);
  290. if (!fs.existsSync(dir))
  291. fs.mkdirSync(dir, { recursive: true });
  292. return dir;
  293. }
  294. switch (type) {
  295. case 'ich': {
  296. const dir = makeDir('ich');
  297. (await ProjectsContent.getContentList(new GetContentListParams(), 1, 1000)).list.forEach(item => {
  298. data.push(item);
  299. });
  300. for (const item of data)
  301. item.detail = (await ProjectsContent.getContentDetail(item.id)) as GetContentDetailItem;
  302. generateMarkdownIch(dir, '非遗项目');
  303. break;
  304. }
  305. case 'seminar': {
  306. const dir = makeDir('seminar');
  307. (await SeminarContent.getContentList(new GetContentListParams(), 1, 1000)).list.forEach(item => {
  308. data.push(item);
  309. });
  310. for (const item of data)
  311. item.detail = (await SeminarContent.getContentDetail(item.id)) as GetContentDetailItem;
  312. generateMarkdownIch(dir, '非遗传习所');
  313. break;
  314. }
  315. case 'old': {
  316. const dir = makeDir('old');
  317. (await CommonContent.getContentList(new GetContentListParams()
  318. .setModelId(17)
  319. .setMainBodyColumnId(312)
  320. , 1, 1000)).list.forEach(item => {
  321. data.push(item);
  322. });
  323. for (const item of data)
  324. item.detail = (await CommonContent.getContentDetail(item.id)) as GetContentDetailItem;
  325. generateMarkdownIch(dir, '老字号');
  326. break;
  327. }
  328. case 'unit': {
  329. const dir = makeDir('unit');
  330. (await UnitContent.getContentList(new GetContentListParams(), 1, 1000)).list.forEach(item => {
  331. data.push(item);
  332. });
  333. for (const item of data)
  334. item.detail = (await UnitContent.getContentDetail(item.id)) as GetContentDetailItem;
  335. generateMarkdownIch(dir, '非遗保护单位');
  336. break;
  337. }
  338. case 'inheritor': {
  339. const dir = makeDir('inheritor');
  340. (await InheritorContent.getContentList(new GetContentListParams(), 1, 1000)).list.forEach(item => {
  341. data.push(item);
  342. });
  343. for (const item of data)
  344. item.detail = (await InheritorContent.getContentDetail(item.id)) as GetContentDetailItem;
  345. generateMarkdownInheritor(dir);
  346. break;
  347. }
  348. case 'artifact': {
  349. const dir = makeDir('artifact');
  350. (await UnmoveableContent.getContentList(new GetContentListParams(), 1, 1000)).list.forEach(item => {
  351. data.push(item);
  352. });
  353. for (const item of data)
  354. item.detail = (await UnmoveableContent.getContentDetail(item.id)) as GetContentDetailItem;
  355. generateMarkdownArtifact(dir);
  356. break;
  357. }
  358. case 'publishToDify': {
  359. const datasetId = argv[3];
  360. const localDir = argv[4];
  361. if (!datasetId || !localDir) {
  362. console.error('请提供 datasetId 和 localDir 参数');
  363. console.error('用法: node ich.js publishToDify <datasetId> <localDir>');
  364. break;
  365. }
  366. await publishToDify(datasetId, localDir);
  367. break;
  368. }
  369. default:
  370. console.log('不支持的类型');
  371. break;
  372. }
  373. }
  374. async function publishToDify(datasetId: string, localDir: string) {
  375. // Dify API 配置
  376. const DIFY_API_KEY = process.env.DIFY_API_KEY || 'dataset-ZELjB79MnbcvCeyjEVMEuTmB';
  377. const DIFY_API_URL = 'http://localhost:8089/v1/datasets';
  378. if (!DIFY_API_KEY) {
  379. console.error('请设置 DIFY_API_KEY 环境变量');
  380. return;
  381. }
  382. if (!fs.existsSync(localDir)) {
  383. console.error(`本地目录不存在: ${localDir}`);
  384. return;
  385. }
  386. console.log(`开始上传文档到 Dify 知识库 (ID: ${datasetId})...`);
  387. console.log(`本地目录: ${localDir}`);
  388. try {
  389. // 读取本地目录中的所有 md 文件
  390. const files = fs.readdirSync(localDir);
  391. const mdFiles = files.filter(file => file.endsWith('.md'));
  392. if (mdFiles.length === 0) {
  393. console.log('目录中没有找到 .md 文件');
  394. return;
  395. }
  396. console.log(`找到 ${mdFiles.length} 个 .md 文件`);
  397. // 逐个上传文件
  398. for (const file of mdFiles) {
  399. console.log(`正在上传: ${file}`);
  400. const filePath = path.join(localDir, file);
  401. const content = fs.readFileSync(filePath, 'utf8');
  402. // 构建请求数据
  403. const requestData = JSON.stringify({
  404. name: file,
  405. text: content,
  406. indexing_technique: 'high_quality',
  407. doc_language: 'zh',
  408. });
  409. // 发送上传请求
  410. const response = await fetch(`${DIFY_API_URL}/${datasetId}/document/create-by-text`, {
  411. method: 'POST',
  412. headers: {
  413. 'Content-Type': 'application/json',
  414. 'Authorization': `Bearer ${DIFY_API_KEY}`
  415. },
  416. body: requestData
  417. });
  418. if (response.ok) {
  419. await response.json();
  420. console.log(`✅ 上传成功: ${file})`);
  421. } else {
  422. const error = await response.json();
  423. console.error(`❌ 上传失败: ${file}`);
  424. console.error(` 错误信息: ${(error as Error).message || response.statusText}`);
  425. }
  426. }
  427. console.log('\n🎉 所有文档上传完成!');
  428. } catch (error) {
  429. console.error('上传过程中发生错误:', error);
  430. }
  431. }
  432. main();