// ich.ts
  1. import { writeFile } from 'fs/promises';
  2. import CommonContent, { GetContentListParams, type GetContentDetailItem, type GetContentListItem } from '../../api/CommonContent';
  3. import ProjectsContent from '../../api/inheritor/ProjectsContent';
  4. import InheritorContent from '../../api/inheritor/InheritorContent';
  5. import SeminarContent from '../../api/inheritor/SeminarContent';
  6. import UnitContent from '../../api/inheritor/UnitContent';
  7. import UnmoveableContent from '../../api/inheritor/UnmoveableContent';
  8. import path from 'path';
  9. import fs from 'fs';
  10. import { argv, cwd } from 'process';
  // Shared in-memory list of content items; `detail` is attached to each item after its detail record has been fetched.
  const data: Array<GetContentListItem & { detail?: GetContentDetailItem }> = [];
  /**
   * Minimal regex-based HTML → Markdown conversion.
   *
   * Handles h1–h3, paragraphs, bold/italic, flat ul/ol lists, images, links and
   * line breaks; every remaining tag is stripped. Inline tags are matched on a
   * single line only (`.` does not cross newlines), and nested lists are not
   * supported.
   *
   * Every tag name is followed by `\b` so that a short tag cannot swallow a
   * longer one (`<b` vs `<br>`, `<i` vs `<img>`, `<p` vs `<pre>`, ...).
   *
   * @param html HTML fragment; a falsy value yields an empty string.
   * @returns The converted, trimmed Markdown text.
   */
  function htmlToMarkdown(html: string): string {
    if (!html) return '';

    // Headings
    html = html.replace(/<h1\b[^>]*>(.*?)<\/h1>/gi, '# $1\n\n');
    html = html.replace(/<h2\b[^>]*>(.*?)<\/h2>/gi, '## $1\n\n');
    html = html.replace(/<h3\b[^>]*>(.*?)<\/h3>/gi, '### $1\n\n');

    // Paragraphs are unwrapped in place (no separator is inserted).
    html = html.replace(/<p\b[^>]*>(.*?)<\/p>/gi, '$1');

    // Bold
    html = html.replace(/<strong\b[^>]*>(.*?)<\/strong>/gi, '**$1**');
    html = html.replace(/<b\b[^>]*>(.*?)<\/b>/gi, '**$1**');

    // Italic
    html = html.replace(/<em\b[^>]*>(.*?)<\/em>/gi, '*$1*');
    html = html.replace(/<i\b[^>]*>(.*?)<\/i>/gi, '*$1*');

    // Unordered lists
    html = html.replace(
      /<ul\b[^>]*>([\s\S]*?)<\/ul>/gi,
      (_list: string, content: string) =>
        content.replace(/<li\b[^>]*>(.*?)<\/li>/gi, '- $1\n') + '\n',
    );

    // Ordered lists. The replacer is a function, so the captured item text must
    // be taken from its arguments: "$1" is NOT expanded in a function's result.
    html = html.replace(/<ol\b[^>]*>([\s\S]*?)<\/ol>/gi, (_list: string, content: string) => {
      let count = 1;
      const items = content.replace(
        /<li\b[^>]*>(.*?)<\/li>/gi,
        (_li: string, text: string) => `${count++}. ${text}\n`,
      );
      return items + '\n';
    });

    // Images: src and alt are read independently so attribute order does not
    // matter and a missing alt still produces an image reference.
    html = html.replace(/<img\b[^>]*>/gi, (tag: string) => {
      const src = /\ssrc="([^"]*)"/i.exec(tag);
      if (!src) return tag; // no usable source: left for the generic tag strip below
      const alt = /\salt="([^"]*)"/i.exec(tag);
      return `![${alt ? alt[1] : ''}](${src[1]})`;
    });

    // Links
    html = html.replace(/<a\b[^>]*href="([^"]*)"[^>]*>(.*?)<\/a>/gi, '[$2]($1)');

    // Line breaks become a single space
    html = html.replace(/<br\b[^>]*>/gi, ' ');

    // Drop every remaining tag
    html = html.replace(/<[^>]*>/g, '');

    // Collapse runs of blank lines
    html = html.replace(/\n\s*\n/g, '\n\n');

    return html.trim();
  }
  /**
   * Turns an arbitrary string into a file name that is safe on Windows, macOS
   * and Linux.
   *
   * Steps: replace illegal characters, collapse repeated replacement strings,
   * trim surrounding whitespace and trailing periods, defuse Windows reserved
   * device names, cap the length at 200 characters, and fall back to
   * `unnamed_file` when nothing is left.
   *
   * @param fileName    Raw name (for example a content title).
   * @param replacement String substituted for each illegal character.
   * @returns The sanitized name; `''` when `fileName` is not a string.
   */
  function sanitizeFileNameAdvanced(fileName: string, replacement = '_'): string {
    if (typeof fileName !== 'string') return '';

    // Characters rejected by at least one mainstream file system, plus the
    // invisible ASCII control characters \x00-\x1F.
    const illegalRegex = /[<>:"/\\|?*\x00-\x1F]/g;
    // Windows reserved device names (matched case-insensitively).
    const windowsReservedNames = new Set([
      'con', 'nul', 'prn', 'aux', 'com1', 'com2', 'com3', 'com4',
      'com5', 'com6', 'com7', 'com8', 'com9', 'lpt1', 'lpt2',
      'lpt3', 'lpt4', 'lpt5', 'lpt6', 'lpt7', 'lpt8', 'lpt9',
    ]);
    const maxFileNameLength = 200; // below the common 255 limit, leaving headroom
    const maxExtensionLength = 10; // longer "extensions" are just text after a dot

    // Step 1: replace illegal characters. A function replacer keeps `$`
    // sequences inside `replacement` from being interpreted as patterns.
    let safeFileName = fileName.replace(illegalRegex, () => replacement);

    // Step 2: collapse consecutive replacements. The replacement is escaped
    // before being used as a pattern, and skipped entirely when it is empty
    // (an empty pattern followed by a quantifier is a syntax error).
    if (replacement) {
      const escaped = replacement.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
      safeFileName = safeFileName.replace(new RegExp(`(?:${escaped}){2,}`, 'g'), () => replacement);
    }

    // Step 3: trim surrounding whitespace and ALL trailing periods/spaces.
    safeFileName = safeFileName.trim().replace(/[.\s]+$/, '');

    // Step 4: Windows reserved names. The part before the first dot decides,
    // so the timestamp is inserted there rather than after the extension.
    const firstDot = safeFileName.indexOf('.');
    const baseName = firstDot === -1 ? safeFileName : safeFileName.slice(0, firstDot);
    if (windowsReservedNames.has(baseName.trim().toLowerCase())) {
      const rest = firstDot === -1 ? '' : safeFileName.slice(firstDot);
      safeFileName = `${baseName}_${Date.now()}${rest}`;
    }

    // Step 5: cap the length, keeping a short real extension when present.
    if (safeFileName.length > maxFileNameLength) {
      const lastDot = safeFileName.lastIndexOf('.');
      const ext = lastDot > 0 ? safeFileName.slice(lastDot + 1) : '';
      safeFileName =
        ext.length > 0 && ext.length <= maxExtensionLength
          ? `${safeFileName.slice(0, maxFileNameLength - ext.length - 1)}.${ext}`
          : safeFileName.slice(0, maxFileNameLength);
      // Truncation can expose a new trailing period or space.
      safeFileName = safeFileName.replace(/[.\s]+$/, '');
    }

    // Step 6: never return an empty name.
    return safeFileName || 'unnamed_file';
  }
  /**
   * Writes one Markdown file per entry of the module-level `data` array into
   * `subDir`. Each file has a front-matter block, a bullet list of basic
   * fields and, when `item.detail` is present, the converted detail sections.
   *
   * Write failures are logged per file and do not stop the remaining items.
   *
   * @param subDir Existing output directory.
   * @param type   Human-readable content type written into the front matter
   *               and the basic-information line.
   */
  async function generateMarkdownIch(subDir: string, type: string): Promise<void> {
    for (const item of data) {
      let md = '---\n';
      // Front-matter line; falsy values are skipped.
      const addMeta = (key: string, value: unknown): void => {
        if (value) md += `${key}: ${value}\n`;
      };
      // Bullet row in the body; falsy values are skipped.
      const addRow = (key: string, value: unknown): void => {
        if (value) md += `- ${key}: ${value}\n`;
      };

      addMeta('level', item.levelText);
      addMeta('crType', item.crTypeText);
      addMeta('region', item.regionText);
      addMeta('batch', item.batchText);
      addMeta('ichType', item.ichTypeText);
      addMeta('type', type);
      addMeta('unit', item.detail?.unit);
      addMeta('name', item.title);
      addMeta('address', item.address);
      md += `---\n\n`;

      // Basic information
      md += `# ${item.title}\n\n`;
      if (item.desc) md += `${item.desc}\n\n`;
      md += `${item.title}类型:${type}\n`;
      addRow(`${item.title}非遗级别`, item.levelText);
      addRow(`${item.title}非遗类别`, item.ichTypeText);
      addRow(`${item.title}地区`, item.regionText);
      addRow(`${item.title}批次`, item.batchText);
      addRow(`${item.title}保护单位`, item.unit);
      addRow(`${item.title}地址`, item.address);
      addRow(`${item.title}字号名称`, item.fontName);
      addRow(`${item.title}认定类型`, item.brandType);
      md += `${item.title}数据库索引ID: 类型: intangible ID: ${item.id || '无'}\n\n`;

      // Detail sections
      const detail = item.detail;
      if (detail) {
        // Introduction
        if (detail.intro) md += htmlToMarkdown(detail.intro) + '\n\n';
        // Main content
        if (detail.content) md += htmlToMarkdown(detail.content) + '\n\n';
        // Inheritors
        if (detail.inheritorsList && detail.inheritorsList.length > 0) {
          md += `${item.title}相关非遗传承人:${detail.inheritorsList.map(inheritor => inheritor.title).join('、')}\n\n`;
          if (detail.inheritor) md += htmlToMarkdown(detail.inheritor) + '\n\n';
        }
        // Training sites
        if (detail.ichSitesList && detail.ichSitesList.length > 0)
          md += `${item.title}相关非遗传习所: ${detail.ichSitesList.map(site => site.title).join('、')}\n\n`;
        // Other levels of the same project
        if (detail.otherLevel && detail.otherLevel.length > 0)
          md += `${item.title}其他级别: ` + detail.otherLevel.map(project => project.levelText).join('、') + `\n\n`;
        // Lineage
        if (detail.pedigree)
          md += `${item.title}的传承谱系: ` + htmlToMarkdown(detail.pedigree as string) + '\n\n';
      }

      // The sanitizer never returns '' for a string (it falls back to
      // "unnamed_file"), so the id fallback has to be chosen here; otherwise
      // every untitled item would overwrite the same file.
      const fileName = `${item.title ? sanitizeFileNameAdvanced(item.title) : item.id}.md`;
      try {
        await writeFile(path.join(subDir, fileName), md);
      } catch (err) {
        console.error(`写入文件 ${fileName} 失败:`, err);
      }
    }
  }
  /**
   * Writes one Markdown file per inheritor in the module-level `data` array
   * into `subDir`: front matter, a basic-information list, the database index
   * and, when `item.detail` is present, the converted detail sections.
   *
   * Write failures are logged per file and do not stop the remaining items.
   *
   * @param subDir Existing output directory.
   */
  async function generateMarkdownInheritor(subDir: string): Promise<void> {
    for (const item of data) {
      let md = '---\n';
      // Front-matter line; falsy values are skipped.
      const addMeta = (key: string, value: unknown): void => {
        if (value) md += `${key}: ${value}\n`;
      };
      // Bullet row in the body; falsy values are skipped.
      const addRow = (key: string, value: unknown): void => {
        if (value) md += `- ${key}: ${value}\n`;
      };

      // '1' means male; any other non-empty value is rendered as female, as
      // before. A missing value is now omitted instead of defaulting to female.
      const rawGender = String(item.detail?.gender ?? '').trim();
      const genderText = rawGender === '' ? undefined : rawGender === '1' ? '男' : '女';

      addMeta('level', item.levelText);
      addMeta('type', '非遗传承人');
      addMeta('unit', item.detail?.unit);
      addMeta('sex', genderText);
      addMeta('name', item.title);
      md += `---\n\n`;

      // Basic information
      md += `# ${item.title}\n\n`;
      if (item.desc) md += `${item.desc}\n\n`;
      md += `## 基本信息\n\n`;
      md += `${item.title}类型:非遗传承人\n`;
      addRow('民族', item.detail?.nation);
      addRow('性别', genderText);
      addRow('出生日期', item.detail?.dateBirth);
      addRow('出生地区', item.detail?.birthplace);
      addRow('单位', item.detail?.unit);
      // The association list may be absent even when detail exists.
      addRow('传承项目', item.detail?.associationMeList?.[0]?.title);
      // Level comes from levelText; batchText belongs to the batch row below.
      addRow('传承人级别', item.levelText);
      addRow('公布批次', item.detail?.batchText);
      md += `\n## ${item.title}数据库索引ID\n\n`;
      md += `- 类型: inheritor\n`;
      md += `- ID: ${item.id || '无'}\n\n`;

      // Detail sections
      const detail = item.detail;
      if (detail) {
        // Introduction
        if (detail.intro) {
          md += `## ${item.title}简介\n\n`;
          md += htmlToMarkdown(detail.intro) + '\n\n';
        }
        if (detail.content) md += htmlToMarkdown(detail.content) + '\n\n';
        // Awards
        if (detail.prize) md += htmlToMarkdown(detail.prize as string) + '\n\n';
        // Related projects (separator added to match the sibling lines)
        if (detail.associationMeList && detail.associationMeList.length > 0)
          md += `${item.title}相关项目: ${detail.associationMeList.map(project => project.title).join('、')}\n\n`;
        // Training sites
        if (detail.ichSitesList && detail.ichSitesList.length > 0)
          md += `${item.title}相关非遗传习所: ${detail.ichSitesList.map(site => site.title).join('、')}\n\n`;
      }

      // The sanitizer never returns '' for a string, so the id fallback is
      // chosen here to keep untitled items from overwriting each other.
      const fileName = `${item.title ? sanitizeFileNameAdvanced(item.title) : item.id}.md`;
      try {
        await writeFile(path.join(subDir, fileName), md);
      } catch (err) {
        console.error(`写入文件 ${fileName} 失败:`, err);
      }
    }
  }
  /**
   * Writes one Markdown file per artifact in the module-level `data` array
   * into `subDir`: front matter, a basic-information list, the database index,
   * an optional video reference and, when `item.detail` is present, the
   * converted detail sections.
   *
   * Write failures are logged per file and do not stop the remaining items.
   *
   * @param subDir Existing output directory.
   */
  async function generateMarkdownArtifact(subDir: string): Promise<void> {
    for (const item of data) {
      let md = '---\n';
      // Front-matter line; falsy values are skipped.
      const addMeta = (key: string, value: unknown): void => {
        if (value) md += `${key}: ${value}\n`;
      };
      // Bullet row in the body; falsy values are skipped.
      const addRow = (key: string, value: unknown): void => {
        if (value) md += `- ${key}: ${value}\n`;
      };

      addMeta('name', item.title);
      addMeta('crType', item.crTypeText);
      addMeta('region', item.regionText);
      addMeta('level', item.levelText);
      addMeta('address', item.address);
      addMeta('age', item.age);
      addMeta('type', '非遗文物');
      addMeta('unit', item.detail?.unit);
      md += `---\n\n`;

      // Basic information
      md += `# ${item.title}\n\n`;
      if (item.desc) md += `${item.desc}\n\n`;
      md += `## 基本信息\n\n`;
      md += `类型:非遗文物\n`;
      addRow('开放时间', item.detail?.openStatusText);
      addRow('年代', item.age);
      addRow('级别', item.levelText);
      addRow('所属区域', item.regionText);
      addRow('文物类型', item.crTypeText);
      addRow('单位', item.detail?.unit);
      md += `\n## 数据库索引ID\n\n`;
      md += `- 类型: artifact\n`;
      md += `- ID: ${item.id || '无'}\n\n`;

      if (item.video) {
        md += `## 视频\n\n`;
        md += `![${item.title}视频](${item.video})\n\n`;
      }

      // Detail sections
      const detail = item.detail;
      if (detail) {
        // Introduction
        if (detail.intro) {
          md += `## 简介\n\n`;
          md += htmlToMarkdown(detail.intro) + '\n\n';
        }
        // Main content
        if (detail.content) {
          md += `## 详情\n\n`;
          md += htmlToMarkdown(detail.content) + '\n\n';
        }
        // Protected area
        if (detail.protectedArea) {
          md += `## 保护范围\n\n`;
          md += htmlToMarkdown(detail.protectedArea as string) + '\n\n';
        }
        // Building environment
        if (detail.environment) {
          md += `## 建筑环境\n\n`;
          md += htmlToMarkdown(detail.environment as string) + '\n\n';
        }
        // Value assessment
        // NOTE(review): the property name is the Chinese heading itself; confirm
        // that the detail payload really exposes a field with this key.
        if (detail.价值评估) {
          md += `## 价值评估\n\n`;
          md += htmlToMarkdown(detail.价值评估 as string) + '\n\n';
        }
      }

      // The sanitizer never returns '' for a string, so the id fallback is
      // chosen here to keep untitled items from overwriting each other.
      const fileName = `${item.title ? sanitizeFileNameAdvanced(item.title) : item.id}.md`;
      try {
        await writeFile(path.join(subDir, fileName), md);
      } catch (err) {
        console.error(`写入文件 ${fileName} 失败:`, err);
      }
    }
  }
  /**
   * CLI entry point. `argv[2]` selects the job:
   *
   * - `ich`, `seminar`, `old`, `unit`, `inheritor`, `artifact`: fetch the list
   *   and each item's detail, then write Markdown files under `dist/<job>`.
   * - `publishToDify <datasetId> <localDir>`: upload the `.md` files in
   *   `localDir`.
   *
   * Any other value prints an "unsupported type" message.
   */
  async function main(): Promise<void> {
    const type = argv[2];
    // Only this single page is requested; a list longer than PAGE_SIZE is not
    // paged through.
    const PAGE = 1;
    const PAGE_SIZE = 1000;

    // Creates dist/<name> under the current working directory and returns it.
    // A recursive mkdir is a no-op when the directory already exists.
    const makeDir = (name: string): string => {
      const dir = path.join(cwd(), `dist/${name}`);
      fs.mkdirSync(dir, { recursive: true });
      return dir;
    };

    // Shared pipeline for every export job: list -> details -> render.
    const exportAll = async (
      dirName: string,
      fetchList: () => Promise<{ list: GetContentListItem[] }>,
      fetchDetail: (id: GetContentListItem['id']) => Promise<unknown>,
      render: (dir: string) => Promise<void>,
    ): Promise<void> => {
      const dir = makeDir(dirName);
      for (const item of (await fetchList()).list) data.push(item);
      // Details are fetched one at a time, in list order.
      for (const item of data)
        item.detail = (await fetchDetail(item.id)) as GetContentDetailItem;
      // Awaited so that main() only resolves once every file has been written.
      await render(dir);
    };

    switch (type) {
      case 'ich':
        await exportAll(
          'ich',
          () => ProjectsContent.getContentList(new GetContentListParams(), PAGE, PAGE_SIZE),
          id => ProjectsContent.getContentDetail(id),
          dir => generateMarkdownIch(dir, '非遗项目'),
        );
        break;
      case 'seminar':
        await exportAll(
          'seminar',
          () => SeminarContent.getContentList(new GetContentListParams(), PAGE, PAGE_SIZE),
          id => SeminarContent.getContentDetail(id),
          dir => generateMarkdownIch(dir, '非遗传习所'),
        );
        break;
      case 'old':
        await exportAll(
          'old',
          () => CommonContent.getContentList(
            new GetContentListParams()
              .setModelId(17)
              .setMainBodyColumnId(312),
            PAGE,
            PAGE_SIZE,
          ),
          id => CommonContent.getContentDetail(id),
          dir => generateMarkdownIch(dir, '老字号'),
        );
        break;
      case 'unit':
        await exportAll(
          'unit',
          () => UnitContent.getContentList(new GetContentListParams(), PAGE, PAGE_SIZE),
          id => UnitContent.getContentDetail(id),
          dir => generateMarkdownIch(dir, '非遗保护单位'),
        );
        break;
      case 'inheritor':
        await exportAll(
          'inheritor',
          () => InheritorContent.getContentList(new GetContentListParams(), PAGE, PAGE_SIZE),
          id => InheritorContent.getContentDetail(id),
          dir => generateMarkdownInheritor(dir),
        );
        break;
      case 'artifact':
        await exportAll(
          'artifact',
          () => UnmoveableContent.getContentList(new GetContentListParams(), PAGE, PAGE_SIZE),
          id => UnmoveableContent.getContentDetail(id),
          dir => generateMarkdownArtifact(dir),
        );
        break;
      case 'publishToDify': {
        const datasetId = argv[3];
        const localDir = argv[4];
        if (!datasetId || !localDir) {
          console.error('请提供 datasetId 和 localDir 参数');
          console.error('用法: node ich.js publishToDify <datasetId> <localDir>');
          break;
        }
        await publishToDify(datasetId, localDir);
        break;
      }
      default:
        console.log('不支持的类型');
        break;
    }
  }
  /**
   * Uploads every `.md` file found directly inside `localDir` to a Dify
   * dataset through the `document/create-by-text` endpoint, one request per
   * file.
   *
   * Configuration comes from the environment:
   * - `DIFY_API_KEY` (required): dataset API key. There is deliberately no
   *   built-in default; credentials must not live in source control.
   * - `DIFY_API_URL` (optional): datasets base URL, defaulting to
   *   `http://localhost:8089/v1/datasets`.
   *
   * A failure on one file is logged and the remaining files are still
   * attempted; a summary is printed at the end.
   *
   * @param datasetId Target Dify dataset id.
   * @param localDir  Directory containing the Markdown files to upload.
   */
  async function publishToDify(datasetId: string, localDir: string): Promise<void> {
    // Dify API configuration
    const DIFY_API_KEY = process.env.DIFY_API_KEY;
    const DIFY_API_URL = (process.env.DIFY_API_URL || 'http://localhost:8089/v1/datasets').replace(/\/+$/, '');

    if (!DIFY_API_KEY) {
      console.error('请设置 DIFY_API_KEY 环境变量');
      return;
    }
    if (!fs.existsSync(localDir)) {
      console.error(`本地目录不存在: ${localDir}`);
      return;
    }

    console.log(`开始上传文档到 Dify 知识库 (ID: ${datasetId})...`);
    console.log(`本地目录: ${localDir}`);

    // Collect the .md files (regular files only) in the directory.
    let mdFiles: string[];
    try {
      mdFiles = fs
        .readdirSync(localDir, { withFileTypes: true })
        .filter(entry => entry.isFile() && entry.name.endsWith('.md'))
        .map(entry => entry.name);
    } catch (error) {
      console.error('上传过程中发生错误:', error);
      return;
    }

    if (mdFiles.length === 0) {
      console.log('目录中没有找到 .md 文件');
      return;
    }
    console.log(`找到 ${mdFiles.length} 个 .md 文件`);

    const endpoint = `${DIFY_API_URL}/${encodeURIComponent(datasetId)}/document/create-by-text`;
    let succeeded = 0;
    let failed = 0;

    // Upload the files one by one.
    for (const file of mdFiles) {
      console.log(`正在上传: ${file}`);
      try {
        const content = fs.readFileSync(path.join(localDir, file), 'utf8');
        const response = await fetch(endpoint, {
          method: 'POST',
          headers: {
            'Content-Type': 'application/json',
            'Authorization': `Bearer ${DIFY_API_KEY}`,
          },
          body: JSON.stringify({
            name: file,
            text: content,
            indexing_technique: 'high_quality',
            doc_language: 'zh',
          }),
        });

        // Read the body as text first: an error response is not guaranteed
        // to be JSON (for example a proxy error page).
        const bodyText = await response.text().catch(() => '');
        if (response.ok) {
          succeeded++;
          console.log(`✅ 上传成功: ${file}`);
          continue;
        }

        let message = response.statusText;
        try {
          const parsed = JSON.parse(bodyText) as { message?: unknown } | null;
          if (parsed && typeof parsed.message === 'string' && parsed.message)
            message = parsed.message;
        } catch {
          // Non-JSON error body: keep the HTTP status text.
        }
        failed++;
        console.error(`❌ 上传失败: ${file}`);
        console.error(` 错误信息: ${message}`);
      } catch (error) {
        // Network or file-system error for this file only; keep going.
        failed++;
        console.error(`❌ 上传失败: ${file}`);
        console.error('上传过程中发生错误:', error);
      }
    }

    if (failed === 0) {
      console.log('\n🎉 所有文档上传完成!');
    } else {
      console.error(`\n上传结束: 成功 ${succeeded} 个, 失败 ${failed} 个`);
    }
  }
  // Run the CLI. A rejected promise is reported and turned into a non-zero
  // exit code instead of surfacing as an unhandled rejection.
  main().catch((err: unknown) => {
    console.error(err);
    process.exitCode = 1;
  });