1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142
| import os import fitz
def _output_dir(source, file_path, save_path):
    """Return the directory under ``save_path`` mirroring ``source``'s place under ``file_path``."""
    # Single-file input (or a path that is not under file_path): write
    # straight into save_path.
    if source == file_path or not source.startswith(file_path):
        return save_path
    # Strip only the leading file_path prefix. A blanket str.replace() would
    # also rewrite later occurrences of the same text (e.g. "in/" in "admin/").
    relative_dir = os.path.dirname(source[len(file_path):]).lstrip("/\\")
    return os.path.join(save_path, relative_dir)


def _save_leading_pages(doc, num, target):
    """Copy the pages of ``doc`` up to and including index ``num`` into a new PDF at ``target``."""
    with fitz.open() as extracted:
        # Only to_page is given, so insert_pdf starts from its default first page.
        extracted.insert_pdf(doc, to_page=num)
        print("提取到路径:" + target)
        extracted.save(target)
    print("提取完成")


def _save_page_as_png(doc, num, target):
    """Render page ``num`` of ``doc`` at 2x zoom without alpha and save it as a PNG at ``target``."""
    zoom_x = 2.0
    zoom_y = 2.0
    trans = fitz.Matrix(zoom_x, zoom_y).prerotate(0)
    pixmap = doc[num].get_pixmap(matrix=trans, alpha=False)
    print("提取到路径:" + target)
    pixmap.save(target)
    print("提取并转换为图片完成")


def analysis(file_path, save_path, num, toimg):
    """Extract the leading page(s) of one PDF or of every PDF under a directory.

    Args:
        file_path: A single file, or a directory that is walked recursively
            via ``get_path_file``.
        save_path: Output root. For directory input the sub-directory layout
            below ``file_path`` is recreated below it.
        num: Zero-based page index. In PDF mode pages up to and including
            ``num`` are copied; in image mode page ``num`` is rendered.
        toimg: When true, write ``<name>.png`` instead of a PDF.

    Progress and a final summary (success count plus one line per skipped
    file) are printed to stdout; nothing is returned.

    Files whose name does not end in ``.pdf`` (case-insensitive) are skipped,
    as are documents that do not contain page ``num``. A one-page PDF is
    processed normally when ``num`` is 0.
    """
    if os.path.isdir(file_path):
        file_array = list(get_path_file(file_path))
    else:
        file_array = [file_path]

    if not file_array:
        print("此目录下无文件")
        return

    print("程序运行中,共计%s个文件" % len(file_array))
    success_num = 0
    failed_file_array = []
    for v in file_array:
        print("原文件路径:%s" % v)
        file_name = os.path.basename(v)
        # Check the real extension: a substring test would accept names such
        # as "a.pdf.txt" and reject "A.PDF".
        if not file_name.lower().endswith('.pdf'):
            print("此文件非PDF文件,跳过")
            failed_file_array.append("非 PDF文件:" + file_name)
            continue

        # The context manager closes the document even on `continue` or error.
        with fitz.open(v) as doc:
            if doc.page_count == 0 or num >= doc.page_count:
                print("此文档无内容,跳过")
                failed_file_array.append("文件无内容:" + file_name)
                continue

            target_dir = _output_dir(v, file_path, save_path)
            os.makedirs(target_dir, exist_ok=True)
            if toimg:
                stem = os.path.splitext(file_name)[0]
                _save_page_as_png(doc, num, os.path.join(target_dir, stem + ".png"))
            else:
                _save_leading_pages(doc, num, os.path.join(target_dir, file_name))
        success_num += 1

    print("\n\n合计 %d 个文件提取成功,以下文件提取失败:" % success_num)
    for f in failed_file_array:
        print(f)
def get_path_file(files_path):
    """List every file found beneath ``files_path``.

    The tree is walked bottom-up (``topdown=False``), so files in nested
    directories are listed before those of their parent. Each entry is the
    walk root joined with the file name, with backslashes converted to
    forward slashes.
    """
    return [
        os.path.join(folder, filename).replace("\\", "/")
        for folder, _subdirs, filenames in os.walk(files_path, topdown=False)
        for filename in filenames
    ]
def uni_path(path: str) -> str:
    """Normalise a path to forward slashes.

    Each doubled backslash is turned into one ``/`` first; every remaining
    single backslash is then turned into ``/`` as well.
    """
    double_backslash = "\\\\"
    single_backslash = "\\"
    without_doubles = "/".join(path.split(double_backslash))
    return "/".join(without_doubles.split(single_backslash))
if __name__ == '__main__':
    # Interactive entry point: show the menu, read the mode and the two
    # paths from stdin, then run the extraction on page index 0.
    print("|---------------------------------|")
    print("|++++++++ PDF处理工具箱 ++++++++|")
    print("|---------------------------------|")
    print("| |")
    print("| 1. PDF 批量提取首页 |")
    print("| 2. PDF 批量提取首页并转换为图片 |")
    print("| |")
    print("|---------------------------------|")

    choice = input("请输入要执行操作的编号:")
    # Only "2" selects image output; any other input behaves like option 1.
    toimg = choice == '2'

    now_path = os.getcwd()
    print("当前位置:%s" % now_path)
    print("请输入参数,以 / 结尾")

    save_path = input("提取文件保存地址:")
    if not os.path.exists(save_path):
        # makedirs also creates missing parent directories; mkdir would
        # raise FileNotFoundError for a nested path such as "out/first/".
        os.makedirs(save_path)

    num = 0
    file_path = input("待处理PDF文件地址:")

    analysis(uni_path(file_path), uni_path(save_path), num, toimg)
|