django|python 从docx文件中读取文字和图片,其中图片编码成base64格式(高中信息技术题库系统)

网络上介绍读取docx文本的文章很多,但如何把每一个自然段逐一存入相应的字典键值对(key: value)中却非常繁琐,需要把逻辑理得很清楚。
下面再次给出我的需求和方案。
django|python 从docx文件中读取文字和图片,其中图片编码成base64格式(高中信息技术题库系统)
文章图片

最终读取后形成如下格式:
[{
"content": "输入一个正整数,输出所有的质因子。如24=2*2*2*3。实现上述功能的Python代码如下:\n\n\n\nn=int(input(″输入一个正整数:″))\n\n\n\ni=2\n\n\n\nwhile______①______:\n\n\n\n if n % i==0:\n\n\n\nn=n/i\n\n\n\nprint(i)\n\n\n\n else:\n\n\n\n______②______\n\n\n\n(1)在程序划线处填入合适的代码。\n\n\n\n(2)按照上述算法,输入60,依次输出的质因子是____________。",
"answer": "(1)①n>1或n! =1 ②i+=1 (2)2 2 3 5",
"explain": "最小的质因数是2,如果能被2整除,则反复相除,当不能被2整除时,将i增加1,尝试被3整除,如果还不能除通,往上增加到4,由于前面反复除2操作,因此不可能被不是质数的数除通。当相除的结果为1时,终止循环。输入60,可以被2除2次,被3除1次,被5除1次。",
"reference": 53,
"type": "填空题",
"difficulty_level": "中级",
"knowledgepoint": 11,
"open_level": "public",
"tags": "while循环",
"top": false
},]

# Standard-library imports.
# NOTE: the deprecated ``imp`` module is intentionally not imported here: it
# is not used by this script and no longer exists on Python 3.12+, where
# importing it makes the whole file fail before any code runs.
import base64
import struct
import sys
from email import contentmanager
from io import BytesIO
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # python-docx types are referenced in annotations only, so the parsing
    # functions below can be imported without python-docx being installed.
    from docx.document import Document
    from docx.image.image import Image as DocxImage
    from docx.oxml.shape import CT_Picture
    from docx.parts.image import ImagePart
    from docx.text.paragraph import Paragraph

# Question-number prefixes recognised at the start of a paragraph: "1." .. "30.".
question_number_list = [f"{i}." for i in range(1, 31)]

# Section-heading selector: maps the heading text that follows the one-character
# ordinal (e.g. "一、选择题"[1:5]) to the question type it introduces.
text_head_filling = "、填空题"
text_head_choice = "、选择题"
text_head_SQAS = "、简答题"
type_dict = {
    head: head[1:]
    for head in (text_head_filling, text_head_choice, text_head_SQAS)
}

# Markers that introduce the answer / explanation part of a question.
_ANSWER_MARK = "【答案】"
_EXPLAIN_MARK = "【解析】"


def get_picture(document: "Document", paragraph: "Paragraph") -> "list[DocxImage]":
    """Return the images embedded in ``paragraph``.

    Args:
        document: The document that owns ``paragraph``; its relationship table
            (``document.part.related_parts``) is used to resolve picture ids.
        paragraph: The paragraph whose XML is searched for ``pic:pic`` elements.

    Returns:
        A list with the ``image`` object of every embedded picture, in document
        order.  The list is empty when the paragraph contains no embedded
        picture (callers only need a truthiness check).
    """
    images = []
    pictures: "list[CT_Picture]" = paragraph._element.xpath(".//pic:pic")
    for picture in pictures:
        embed_ids = picture.xpath(".//a:blip/@r:embed")
        if not embed_ids:
            # No r:embed attribute (e.g. a linked picture): nothing to read.
            continue
        try:
            related_part: "ImagePart" = document.part.related_parts[embed_ids[0]]
        except KeyError:
            # Dangling relationship id; skip instead of aborting the import.
            continue
        image = getattr(related_part, "image", None)
        if image is not None:
            images.append(image)
    return images


def _question_prefix_length(text: str) -> int:
    """Return the length of a leading "N." question number in ``text``, else 0."""
    dot = text.find(".")
    # "1." puts the dot at index 1 and "30." at index 2; anything further right
    # is a dot inside the sentence (e.g. "print(1.5)"), not a question number.
    if 0 < dot <= 2 and text[: dot + 1] in question_number_list:
        return dot + 1
    return 0


def get_content(paragraph: "Paragraph"):
    """Classify one paragraph of the question document.

    Args:
        paragraph: Any object with a ``text`` attribute holding the paragraph
            text.

    Returns:
        A tuple ``(new_question, new_type, new_key, content)``:

        * ``new_question`` -- True when the paragraph starts with a question
          number such as ``"1."``.
        * ``new_type`` -- the question type when the paragraph is a section
          heading such as ``"一、选择题"``, otherwise ``""``.
        * ``new_key`` -- ``"content"``, ``"answer"`` or ``"explain"`` when the
          paragraph starts that field, otherwise ``""`` (continuation line).
        * ``content`` -- the paragraph text with the question number or the
          answer/explanation marker removed; ``""`` for blank lines, figure
          captions and section headings.
    """
    raw = paragraph.text
    stripped = raw.strip()

    # Blank line.
    if not stripped:
        return False, "", "", ""

    # Figure caption such as "第1题图" / "第12题图".
    if stripped.startswith("第") and "题图" in (stripped[2:4], stripped[3:5]):
        return False, "", "", ""

    # Section heading ("一、选择题", "二、填空题", ...): only switches the type.
    section_type = type_dict.get(stripped[1:5])
    if section_type is not None:
        return False, section_type, "", ""

    # First line of a question: the number must open the paragraph, so a dot
    # further inside the text can never start a new question.
    body = raw.lstrip()
    prefix_length = _question_prefix_length(body)
    if prefix_length:
        return True, "", "content", body[prefix_length:]

    # Answer / explanation line: drop everything up to and including the marker.
    # The marker is located in the same string that is sliced, and its real
    # length is skipped, so no character of the payload is lost.
    for marker, key in ((_ANSWER_MARK, "answer"), (_EXPLAIN_MARK, "explain")):
        marker_at = raw.find(marker)
        if marker_at >= 0:
            return False, "", key, raw[marker_at + len(marker):].lstrip()

    # Continuation of the current field; leading indentation is kept because
    # question bodies may contain program code.
    return False, "", "", raw


def _image_data_uri(image) -> str:
    """Encode ``image.blob`` as a ``data:`` URI (RFC 2397, no inner spaces)."""
    # NOTE(review): ``content_type`` is the MIME-type attribute documented for
    # python-docx image objects; fall back to JPEG when it is unavailable.
    mime_type = getattr(image, "content_type", None) or "image/jpeg"
    payload = base64.b64encode(image.blob).decode("ascii")
    return "data:" + mime_type + ";base64," + payload


def ReadDocx2List(d: "Document", *, show_images: bool = False):
    """Convert a question document into a list of question dictionaries.

    Parsing starts at the first section heading (e.g. ``"一、选择题"``), or at the
    first paragraph when the document has no heading.  Every paragraph that
    starts with a question number opens a new question; answer/explanation
    markers switch the field that following paragraphs are appended to.

    Args:
        d: The document; ``d.paragraphs`` is read once and
            ``d.part.related_parts`` is used to resolve embedded pictures.
        show_images: When True, open each embedded picture in the default
            image viewer (requires Pillow).  Intended for manual debugging.

    Returns:
        A list of dictionaries with the keys ``"content"``, ``"answer"``,
        ``"explain"``, ``"type"`` and ``"pictures"``.  Text fields hold one
        ``"\\n"``-terminated line per source paragraph; ``"pictures"`` holds the
        data URIs of the question's pictures joined by ``"-"``.  The list is
        empty when nothing was collected.
    """

    def blank_question(question_type):
        return {
            "content": "",
            "answer": "",
            "explain": "",
            "type": question_type,
            "pictures": "",
        }

    # Snapshot the paragraphs once instead of re-reading d.paragraphs on every
    # loop iteration.
    paragraphs = list(d.paragraphs)

    # Skip the front matter before the first section heading, if there is one.
    start = next(
        (
            index
            for index, paragraph in enumerate(paragraphs)
            if paragraph.text.strip()[1:5] in type_dict
        ),
        0,
    )

    questions_list = []
    question_dict = blank_question("")
    current_key = "content"
    current_type = "选择题"  # type assumed until the first heading says otherwise
    seen_question = False

    for paragraph in paragraphs[start:]:
        # Attach embedded pictures to the question currently being collected.
        for image in get_picture(d, paragraph) or ():
            if not image:
                continue
            if show_images:
                from PIL import Image as PILImage  # optional debugging aid

                PILImage.open(BytesIO(image.blob)).show()
            uri = _image_data_uri(image)
            if question_dict["pictures"]:
                question_dict["pictures"] += "-" + uri
            else:
                question_dict["pictures"] = uri

        # A paragraph may hold text next to an inline picture, so its text is
        # always classified as well (captions and blank text are ignored by
        # get_content).
        new_question, new_type, new_key, text = get_content(paragraph)

        if new_type:
            # Section heading: later questions get this type.
            current_type = new_type
            current_key = "content"
            continue

        if new_question:
            if seen_question:
                # Close the previous question before starting the next one.
                questions_list.append(question_dict)
                question_dict = blank_question(current_type)
                question_dict["content"] += text + "\n"
            else:
                # First question: text collected before it is preamble and is
                # replaced; pictures collected so far are kept.
                seen_question = True
                question_dict["type"] = current_type
                question_dict["content"] = text + "\n"
            current_key = "content"
        elif new_key:
            # Start of the answer or explanation field.
            current_key = new_key
            question_dict[current_key] += text + "\n"
        elif text:
            # Continuation line of the current field.
            question_dict[current_key] += text + "\n"

    # Flush the last question, unless nothing at all was collected.
    if any(question_dict.values()):
        questions_list.append(question_dict)
    return questions_list


if __name__ == "__main__":
    import docx  # python-docx; only needed here to open the file

    data_list = ReadDocx2List(docx.Document("test.docx"))
    print(data_list)

【django|python 从docx文件中读取文字和图片,其中图片编码成base64格式(高中信息技术题库系统)】代码中的注释比较详细了。有疑问的请留言,互相探讨。

    推荐阅读