java解析doc文件

依賴
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.11.2</version>
</dependency>
<dependency>
    <groupId>com.aspose</groupId>
    <artifactId>aspose</artifactId>
    <version>15.8.0</version>
</dependency>

public class Word{
　　public TitleTreeVO wordAnalysis(MultipartFile multipartFile) throws IOException{
　　　　byte[] byteArr = multipartFile.getBytes();
　　　　InputStream inputStream = new ByteArrayInputStream(byteArr);
　　　　List<DocumentContentVO> documentContentVOList = new LinkedList<>();
　　　　TitleTreeVO titleTreeVO = new TitleTreeVO();
　　　　try{
　　　　　　//
　　　　　　com.aspose.words.Document doc = new com.aspose.words.Document(inputStream);
　　　　　　//設置轉化的格式，HtmlSaveOptions轉換為HTML格式；
　　　　　　HtmlSaveOptions saveOptions = new HtmlSaveOptions();
　　　　　　saveOptions.setExportImagesAsBase64(false);
　　　　　　//將所有word中的圖片放在臨時文件夾中，并將html中的鏈接替換為臨時文件夾中絕對路徑
　　　　　　String property = System.getProperty("java.io.tmpdir");
　　　　　　saveOptions.setImagesFolder(property);
　　　　　　org.apache.commons.io.output.ByteArrayOutputStream baos = new ByteArrayOutputStream();
　　　　　　doc.save(baos, saveOptions);
　　　　　　String token - DmeTestRequestUtil.getToken();
　　　　　　//將html文件轉化為Document，方便后續使用jsoup的操作
　　　　　　org.jsoup.nodes.DocumenthtmlDoc = jsoup.parse(baos.toString());
　　　　　　//設置html中的圖片src路徑
　　　　　　this.setImagePath(htmlDoc, token);
　　　　　　//存儲word文檔的名稱
　　　　　　String subString = multipartFile.getOriginalFilename().substring(0,multipartFile.getOriginalFilename().lastIndexof("."));
　　　　　　JSONObject docPram = this.getDocParam(substring);
　　　　　　String saveDocUrl = "https://dme.cn-south-4.huaweicloud.com/rdm_hwdmeverify_app/publicservices/api/DocumentSave/create";
　　　　　　//首先根據文檔名稱生成第一條document的數據，產生的id將在標題實體中進行關聯
　　　　　　String dmeResult = DmeTestRequestUtil.getDmeResult(saveDocUrl,docPram,token);
　　　　　　JSONObject jsonObject1 = JSONObject.parseObject(dmeResult);
　　　　　　List data1 = jsonObject1.getObject("data", List.class);
　　　　　　JSONObject jsonObjectData1 = (JSONObject)data1.get(0);
　　　　　　String id = jsonObjectData1.getString("id");//文檔id
　　　　　　//存儲文檔的第一個標題的返回結果，其中包含該節點的id和title
　　　　　　documentContentVOList = this.exactContentFromHtml(htmlDoc);
　　　　　　this.dmeSave(documentContentVOList, id, "0", token);//第一個標題的父ID默認為0
　　　　} catch （Exception e）{
　　　　　　e.printStackTrace();
　　　　} finally {
　　　　　　inputStream.close();
　　　　}
　　　　return titleTreeVO;
　　　　}
　　}

　　//在解析為html文件的時候需要將圖片的地址進行一個替換，由最初的臨時文件地址替換為圖片
　　//設置圖片的路徑（src）
　　private void setImagePath(Document document) throws IOException{
　　　　Element imgs = document.select("img");
　　　　String token = DmeTestRequestUtil.getToken();
　　　　for (Element img : imgs){
　　　　　　//獲取出html中src內的地址值
　　　　　　String src = img.attr("src");
　　　　　　//通過地址查到對應的文件
　　　　　　File file = new File(src);
　　　　　　FileInputStream input = new FileInputStream(file);
　　　　　　//將file轉化為MultipartFile
　　　　　　MultipartFile multipartFile = new MockMultipartFile("file",file.getName(), "text/plain", IOUtils.toByteArray(input));
　　　　　　//該部分主要是第三方接口設置的必須傳的參數，在這里就先設置為定值，因為這些不干擾需求結果
　　　　　　FormVo formVo = new FormVo();
　　　　　　formVo.setAttributeName("File");
　　　　　　formVo.setModelName("Document");
　　　　　　formVo.setApplicationId("-1");
　　　　　　String uploadImgUrl = "圖片作為文件，進行上傳";
　　　　　　String uploadImage = DmeTestRequestUtil.getDmeResultUUploadFile(uploadImgUrl, multipartFile, formVo, token);
　　　　　　JSONObject uploadImgJs = JSONObject.parseObject(uploadImage);
　　　　　　List data = uploadImgJs = JSONObject("data", List.class);
　　　　　　//上傳完成后，第三方接口會返回一個文件的id，可以根據這個id進行文件的預覽和下載
　　　　　　String id = (String)data.get(0);
　　　　　　//上傳文件file并返回上傳的路徑，將路徑拼接出來，并替換到html的document中
　　　　　　String imgPath = "/api/dme-library/LibraryFolder/preview?fileId="+id;
　　　　　　img.attr("src", imgPath);
　　　　　　input.close();
　　　　　　//刪除臨時文件夾中存儲的文件
　　　　　　file.deleteOnExit();
　　　　}
　　}

　　//該部分主要是第三方接口在調用時規定該接口的參數格式
　　//拼接參數
　　private JSONObject getDocParam(String name) {
　　　　Map<String, Object> mapStr = new HashMap<>();
　　　　Map<String, Object> paramMap = new HashMap<>();
　　　　paramMap.put("name", name);
　　　　mapStr.put("params", paramMap);
　　　　JSONObject jsonObject = new JSONObject(mapStr);
　　　　return jsonObject;
　　}

　　//處理樹形結構
　　private List<DocumentContentVO> exactContentFromHtml(Document htmlDoc) throws Exception{
　　　　Element eleList = htmlDoc.getElementsByTag("h1");
　　　　if(eleList == null || eleList.size() == 0){
　　　　　　throw new Exception("上傳的文件中不存在一級標題，請檢查");
　　　　}
　　　　Element hElement = htmlDoc.selectFirst("h1");//從第一個標題1 開始往下找
　　　　List<DocumentContentVO> allTreeList = new ArrayList<>();
　　　　List<DocumentContentVO> list2 = new ArrayList<>();
　　　　List<DocumentContentVO> list3 = new ArrayList<>();
　　　　List<DocumentContentVO> list4 = new ArrayList<>();
　　　　DocumentContentVO b1Map = new DocumentContentVO();
　　　　DocumentContentVO b2Map = new DocumentContentVO();
　　　　DocumentContentVO b3Map = new DocumentContentVO();
　　　　DocumentContentVO b4Map = new DocumentContentVO();
　　　　DocumentContentVO bMap = b1Map;
　　　　int i = 1;
　　　　b1Map.setTitle(hElement.toString());
　　　　b1Map.setIndex(i);
　　　　allTreeList.add(b1Map);
　　　　while (hElement.nextElementSibling() != null){
　　　　　　i++;
　　　　　　hElement = hElement.nextElementSibling();
　　　　　　String nodeName = hElement.nodeName();
　　　　　　String s = hElement.tagName();
　　　　　　if(Objects.equals(nodeName, "h1")){
　　　　　　　　b1Map = new DocumentContentVO();
　　　　　　　　bMap = b1Map;
　　　　　　　　b1Map.setTitle(hElement.toString());
　　　　　　　　b1Map.setIndex(i);
　　　　　　　　allTreeList.add(b1Map);
　　　　　　　　list2 = new ArrayList<>();
　　　　　　} else if (Objects.equals(nodeName, "h2")){
　　　　　　　　b2Map = new DocumentContentVO();
　　　　　　　　bMap = b2Map;
　　　　　　　　list3 = new ArrayList<>();
　　　　　　　　b2Map.setTitle(hElement.toString());
　　　　　　　　b2Map.setIndex(i);
　　　　　　　　list2.add(b2Map);
　　　　　　　　b1Map.setChildList(list2);
　　　　　　} else if (Objects.equals(nodeName, "h3")){
　　　　　　　　b3Map = new DocumentContentVO();
　　　　　　　　bMap = b3Map;
　　　　　　　　b3Map.setTitle(hElement.toString());
　　　　　　　　b3Map.setIndex(i);
　　　　　　　　list3.add(b3Map);
　　　　　　　　b2Map.setChildList(list3);
　　　　　　} else if(Objects.equals(nodeName, "h4")){
　　　　　　　　b4Map = new DocumentContentVO();
　　　　　　　　bMap = b4Map;
　　　　　　　　b4Map.setTitle(hElement.toString());
　　　　　　　　b4Map.setIndex(i);
　　　　　　　　list4.add(b4Map);
　　　　　　　　b3Map.setChildList(list4);
　　　　　　} else {
　　　　　　　　bMap.setContent(bMap.getContent() == null ? hElement.toString() : bMap.getContent() + hElement.toString());

　　　　　　}
　　　　}
　　　　return allTreeList;
　　}

　　//傳入html解析的樹和對應文檔id 通過遞歸實現保存
　　private String dmeSave(List<DocumentContentVO> treeList, String id , String parentId, String token){
　　　　String dmeResult = null;
　　　　for(DocumentContentVO documentContentVO : treeList) {
　　　　　　if(documentContentVO != null){
　　　　　　　　String title = documentContentVO.getTitle();
　　　　　　　　int sort = documentContentVO.getIndex();
　　　　　　　　String content = documentContentVO.getContent();
　　　　　　　　String url = "創建對應數據的第三方url";
　　　　　　　　JSONObject jsonObjectParam = this.paramJoin1(title, id, parentId, sort, content)
　　　　　　　　dmeResult = DmeTestRequestUtil.getDmeResult(url, jsonObjectParam, token);
　　　　　　　　List data = JSONObject.parseObject(dmeResult).getObject("data", List.class);
　　　　　　　　if(data != null && !data.isEmpty()){
　　　　　　　　　　JSONObject jsonObject = (JSONObject)data.get(0);
　　　　　　　　　　String parentIdNext = jsonObject.getString("id");
　　　　　　　　　　if(documentContentVO.getChildList() != null && documentContentVO.getChildList().size() > 0){
　　　　　　　　　　　　dmeSave(documentContentVO.getChildList(), id , parentIdNext,token);
　　　　　　　　　　}
　　　　　　　　}
　　　　　　}
　　　　}
　　　　return dmeResult;
　　}

　　//第三方接口規定的參數樣式，需要進行拼接
　　private JSONObject paramJoin1(String title, String id, String parentId, int sort, String content) {
　　　　Map<String, Object> mapStr = new HashMap<>();
　　　　Map<String, Object> paramMap = new HashMap<>();
　　　　if(id != null){
　　　　　　paramMap.put("title", title);
　　　　　　paramMap.put("sort", sort);
　　　　　　paramMap.put("parentId",parentId);
　　　　　　paramMap.put("content", content);
　　　　　　paramMap.put("documentId", id);
　　　　} else {
　　　　　　paramMap.put("title", title);
　　　　}
　　　　mapStr.put("params", paramMap);
　　　　return JSONObject.parseObject(JSON.toJSONString(mapStr));
　　}
}

/**
* 根據documentId查詢取該篇文檔的標題內容樹形結構
*/

//根據documentId查詢對應的word文檔的樹形結構
public List<TitleTreeVO> getTreeCon(JSONObject reqJSON) {
　　String id = reqJSON.getString("id");
　　List<TitleTreeVO> allTitleByDocId = this.getAllTitleByDocId(id);
　　TitleTreeVO titleTreeVO = new TitleTreeVO();
　　titleTreeVO.setId("0");
　　this.getChild(titleTreeVO, allTitleByDocId);
　　return titleTreeVO.getChildList();
}

//根據文檔id獲取到該文檔的所有標題，（此時獲取的集合沒有父子級關系）
private List<TitleTreeVO> getAllTitleByDocId(String docId){
　　String url = "第三方標題表的查詢";
　　JSONObject docIdParam = getDocIdParam(docId);
　　String token = DmeTestRequestUtil.getDmeResult(url,docIdParam,token);
　　JSONObject jsonObject = JSONObject.parseObject(dmeResult);
　　List date = jsonObject.getObject("data", List.class);
　　List<TitleTreeVO> titleList = new ArrayList<>();
　　if(data != null && !data.isEmpty()){
　　　　for(Object title : data){
　　　　　　JSONObject titleJson = (JSONObject)title;
　　　　　　TitleTreeVO titleTreeVO = new TitleTreeVO();
　　　　　　titleTreeVO.setContent(titleJson.getString("content"));
　　　　　　titleTreeVO.setTitle(titleJson.getString("title"));
　　　　　　titleTreeVO.setId(titleJson.getString("id"));
　　　　　　titleTreeVO.setIndex(Integer.parseInt(titleJson.getString("sort")));
　　　　　　titleTreeVO.setDocumentId(titleJson.getString("documentId"));
　　　　　　titleTreeVO.setParentId(titleJson.getString("parentId"));
　　　　　　titleList.add(titleTreeVO);
　　　　}
　　}
　　return titleList;
}

//通過遞歸獲取到各級的子標題和內容
private TitleTreeVO getChild(TitleTreeVO parentTitleTreeVO, List<TitleTreeVO> titleListOld) {
　　List<TitleTreeVO> titleList = new ArrayList<>();
　　if(titleListOld != null && titleListOld.size() > 0){
　　　　List<TitleTreeVO> titleCollect = titleListOld.stream().filter(e -> e.getParentId().equals(parentTitleTreeVO.getId())).collect(Collectors.toList());
　　　　if(titleCollect.size() > 0){
　　　　　　for(TitleTreeVO title : titleCollect){
　　　　　　　　TitleTreeVO titleTreeVO = new TitleTreeVO();
　　　　　　　　titleTreeVO.setIndex(title.getIndex());
　　　　　　　　titleTreeVO.setTitle(title.getTitle());
　　　　　　　　titleTreeVO.setId(title.getId());
　　　　　　　　titleTreeVO.setContent(title.getContent());
　　　　　　　　titleTreeVO.setDocumentId(title.getDocumentId());
　　　　　　　　titleTreeVO.setParentId(title.getParentId());
　　　　　　　　titleList.add(titleTreeVO);
　　　　　　　　this.getChild(titleTreeVO, titleListOld);
　　　　　　}
　　　　}
　　}
　　List<TitleTreeVO> titleSortList = titleList.stream().sorted(Comparator.comparing(TitleTreeVO::getIndex)).collect(Collectors.toList());
　　parentTitleTreeVO.setChildList(titleSortList);
　　return parentTitleTreeVO;
}

posted on 2024-12-03 10:38 申輝閱讀(318) 評論(0) 收藏舉報

刷新頁面返回頂部