Procházet zdrojové kódy

pdf and markdown loader

WangKang před 7 měsíci
rodič
revize
57d03159c2

+ 8 - 0
ruoyi-modules/ruoyi-system/pom.xml

@@ -138,6 +138,14 @@
             <groupId>org.springframework.boot</groupId>
             <artifactId>spring-boot-starter-test</artifactId>
         </dependency>
+        <dependency>
+            <groupId>org.springframework.ai</groupId>
+            <artifactId>spring-ai-pdf-document-reader</artifactId>
+        </dependency>
+        <dependency>
+            <groupId>org.springframework.ai</groupId>
+            <artifactId>spring-ai-markdown-document-reader</artifactId>
+        </dependency>
     </dependencies>
 
 </project>

+ 42 - 0
ruoyi-modules/ruoyi-system/src/main/java/org/dromara/system/ai/loader/MarkdownResourceLoader.java

@@ -0,0 +1,42 @@
+package org.dromara.system.ai.loader;
+
+import org.dromara.system.ai.domain.bo.ResourceSplitBo;
+import org.springframework.ai.document.Document;
+import org.springframework.ai.reader.markdown.MarkdownDocumentReader;
+import org.springframework.ai.reader.markdown.config.MarkdownDocumentReaderConfig;
+import org.springframework.ai.transformer.splitter.TokenTextSplitter;
+import org.springframework.core.io.Resource;
+
+import java.util.List;
+
+/**
+ * @author destiny
+ * @description: Loads Markdown resources into Spring AI documents.
+ * @date 2025/9/23 17:05
+ */
+public class MarkdownResourceLoader implements ResourceLoader {
+    /** Reads the Markdown resource into documents, tagging them with the file name when one exists. */
+    @Override
+    public List<Document> getDocuments(Resource resource) {
+        MarkdownDocumentReaderConfig.Builder config = MarkdownDocumentReaderConfig.builder()
+            .withHorizontalRuleCreateDocument(false) // horizontal rules do not start a new document
+            .withIncludeCodeBlock(false)             // code-block flag off; effect per Spring AI reader docs
+            .withIncludeBlockquote(false);           // blockquote flag off; effect per Spring AI reader docs
+        if (resource.getFilename() != null) { // nameless resources return null; metadata values must be non-null
+            config.withAdditionalMetadata("filename", resource.getFilename());
+        }
+        return new MarkdownDocumentReader(resource, config.build()).read();
+    }
+
+    /**
+     * Reads the resource and splits the result with a default {@link TokenTextSplitter}.
+     *
+     * @param resource        resource to read
+     * @param resourceSplitBo knowledge split parameters; currently not applied by this loader
+     * @return the documents produced by the splitter
+     */
+    @Override
+    public List<Document> getSplitDocuments(Resource resource, ResourceSplitBo resourceSplitBo) {
+        return new TokenTextSplitter().apply(getDocuments(resource));
+    }
+}

+ 44 - 0
ruoyi-modules/ruoyi-system/src/main/java/org/dromara/system/ai/loader/PdfResourceLoader.java

@@ -0,0 +1,44 @@
+package org.dromara.system.ai.loader;
+
+import org.dromara.system.ai.domain.bo.ResourceSplitBo;
+import org.springframework.ai.document.Document;
+import org.springframework.ai.reader.ExtractedTextFormatter;
+import org.springframework.ai.reader.pdf.PagePdfDocumentReader;
+import org.springframework.ai.reader.pdf.config.PdfDocumentReaderConfig;
+import org.springframework.ai.transformer.splitter.TokenTextSplitter;
+import org.springframework.core.io.Resource;
+
+import java.util.List;
+
+/**
+ * @author destiny
+ * @description: Loads PDF resources into Spring AI documents.
+ * @date 2025/9/23 16:42
+ */
+public class PdfResourceLoader implements ResourceLoader {
+    /** Reads the PDF with a {@link PagePdfDocumentReader} configured for one page per document. */
+    @Override
+    public List<Document> getDocuments(Resource resource) {
+        ExtractedTextFormatter textFormatter = ExtractedTextFormatter.builder()
+            .withNumberOfTopTextLinesToDelete(0) // delete no top text lines
+            .build();
+        PdfDocumentReaderConfig readerConfig = PdfDocumentReaderConfig.builder()
+            .withPageTopMargin(0)
+            .withPageExtractedTextFormatter(textFormatter)
+            .withPagesPerDocument(1)
+            .build();
+        return new PagePdfDocumentReader(resource, readerConfig).read();
+    }
+
+    /**
+     * Reads the resource and splits the result with a default {@link TokenTextSplitter}.
+     *
+     * @param resource        resource to read
+     * @param resourceSplitBo knowledge split parameters; currently not applied by this loader
+     * @return the documents produced by the splitter
+     */
+    @Override
+    public List<Document> getSplitDocuments(Resource resource, ResourceSplitBo resourceSplitBo) {
+        return new TokenTextSplitter().apply(getDocuments(resource));
+    }
+}

+ 4 - 0
ruoyi-modules/ruoyi-system/src/main/java/org/dromara/system/ai/loader/ResourceLoaderFactory.java

@@ -16,6 +16,10 @@ public class ResourceLoaderFactory {
             return new TextResourceLoader();
         } else if (FileType.isWord(fileType)) {
             return new WordResourceLoader();
+        } else if (FileType.isPdf(fileType)) {
+            return new PdfResourceLoader();
+        } else if (FileType.isMdFile(fileType)) {
+            return new MarkdownResourceLoader();
         }
         return null;
     }

+ 9 - 5
ruoyi-modules/ruoyi-system/src/main/java/org/dromara/system/ai/splitter/CustomWordSplitter.java

@@ -25,10 +25,14 @@ public class CustomWordSplitter extends TextSplitter {
 
     /**
      * 对文本进行分片处理(指定块大小和重叠大小)
+     * 1. 空文本检查:如果输入文本为空或空白,则返回空列表
+     * 2. 小文本处理:如果文本长度小于 chunkSize,直接作为一个块返回
+     * 3. 段落分割:按换行符 \n 将文本分割成段落数组
      *
      * @param text      待分片的文本
-     * @param chunkSize 块大小
-     * @param overlap   重叠大小
+     * @param chunkSize 每个文本块的最大字符长度,控制分片后每个文本段的大小(默认 1000)
+     * @param overlap   代表相邻文本块之间的重叠字符数,用于确保语义连续性
+     *                  避免重要信息在切分点处被割裂(默认 200)
      * @return 分片后的文本列表
      */
     public List<String> splitText(String text, int chunkSize, int overlap) {
@@ -54,7 +58,7 @@ public class CustomWordSplitter extends TextSplitter {
 
         for (String paragraph : paragraphs) {
             // 如果当前块加上新段落超过限制,且当前块不为空
-            if (currentChunk.length() + paragraph.length() > chunkSize && currentChunk.length() > 0) {
+            if (currentChunk.length() + paragraph.length() > chunkSize && !currentChunk.isEmpty()) {
                 // 保存当前块
                 chunks.add(currentChunk.toString().trim());
 
@@ -68,14 +72,14 @@ public class CustomWordSplitter extends TextSplitter {
             }
 
             // 添加段落到当前块
-            if (currentChunk.length() > 0) {
+            if (!currentChunk.isEmpty()) {
                 currentChunk.append("\n");
             }
             currentChunk.append(paragraph);
         }
 
         // 添加最后一个块
-        if (currentChunk.length() > 0) {
+        if (!currentChunk.isEmpty()) {
             chunks.add(currentChunk.toString().trim());
         }