使用Python去除PDF水印：一步一步的实现（二）

见贤思齐 · 发表于 2024-9-10 20:24:33

## 前言

在日常工作中，我们经常会遇到带有水印的PDF文件。水印可能会影响文档的可读性或者在打印时造成不便。为了自动化地去除PDF文件中的水印，我们可以使用Python及其强大的库，如PyMuPDF、Pillow和OpenCV。本文将详细介绍如何实现这一过程。

## 所需库

首先，我们需要安装以下Python库：

- PyMuPDF：用于处理PDF文件。
- Pillow：用于图像处理。
- OpenCV：用于图像操作。

使用以下命令安装这些库：

```
pip install pymupdf pillow opencv-python-headless
```

## 实现步骤

### 导入必要的库

我们首先导入所需的库：

```python
import fitz  # PyMuPDF
from PIL import Image
import numpy as np
import cv2
import os
import concurrent.futures
```

### 定义去除水印的函数

接下来，我们定义一个函数来去除图像中的水印。该函数利用颜色范围创建掩码，并将掩码区域设置为白色：

```python
def remove_watermark(image, lower_bound, upper_bound):
    """去除水印"""
    # 将PIL图像转换为OpenCV格式
    open_cv_image = np.array(image)
    open_cv_image = cv2.cvtColor(open_cv_image, cv2.COLOR_RGB2BGR)

    # 创建掩码，查找在指定颜色范围内的像素
    mask = cv2.inRange(open_cv_image, lower_bound, upper_bound)

    # 使用膨胀和侵蚀操作优化掩码
    kernel = np.ones((3, 3), np.uint8)
    mask = cv2.dilate(mask, kernel, iterations=1)
    mask = cv2.erode(mask, kernel, iterations=1)

    # 将掩码范围内的像素设为白色
    open_cv_image[mask != 0] = [255, 255, 255]

    # 将图像转换回PIL格式
    return Image.fromarray(cv2.cvtColor(open_cv_image, cv2.COLOR_BGR2RGB))
```

### 处理单页PDF的函数

我们需要一个函数来处理PDF文件的每一页，将其转换为图像，去除水印后保存为PNG文件：

```python
def process_page(pdf_path, page_num, output_folder, mat, lower_bound, upper_bound):
    """处理单页"""
    try:
        # 打开PDF文件并加载特定页
        pdf = fitz.open(pdf_path)
        page = pdf.load_page(page_num)
        pixmap = page.get_pixmap(matrix=mat)
        img = Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)

        # 调用remove_watermark函数去除水印
        img = remove_watermark(img, lower_bound, upper_bound)

        # 将处理后的图像保存为PNG文件
        img_path = os.path.join(output_folder, f"{page_num}.png")
        img.save(img_path, format="PNG")
        print(f"第{page_num}页水印去除完成")
        pdf.close()
    except fitz.FileDataError:
        print(f"无法读取第{page_num}页的数据。")
    except fitz.PDFPageError:
        print(f"第{page_num}页无法加载。")
    except Exception as e:
        print(f"处理第{page_num}页时出错:{e}")
```

### 主函数

主函数 remove_pdf 负责管理整个PDF的处理过程，包括创建输出文件夹、并行处理每一页以及将处理后的图像合并回PDF：

```python
def remove_pdf(pdf_file, output_folder, output_pdf_path, dpi=1800,
               lower_bound=(168, 168, 168), upper_bound=(172, 172, 172)):
    """去除PDF水印的主函数"""
    if not os.path.exists(pdf_file):
        print(f"文件{pdf_file}未找到。")
        return
    if not pdf_file.lower().endswith('.pdf'):
        print(f"文件{pdf_file}不是PDF文件。")
        return
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    zoom = dpi / 72
    mat = fitz.Matrix(zoom, zoom)

    try:
        pdf = fitz.open(pdf_file)
        total_pages = len(pdf)
        pdf.close()

        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = [
                executor.submit(process_page, pdf_file, page_num, output_folder, mat, lower_bound, upper_bound)
                for page_num in range(total_pages)
            ]
            for future in concurrent.futures.as_completed(futures):
                try:
                    future.result()
                except Exception as e:
                    print(f"处理过程中发生错误:{e}")

        image_files = [os.path.join(output_folder, f"{page_num}.png") for page_num in range(total_pages)]
        image_list = [Image.open(img_file).convert("RGB") for img_file in image_files]
        if image_list:
            image_list[0].save(output_pdf_path, save_all=True, append_images=image_list[1:])
            print(f"处理后的PDF文件保存为:{output_pdf_path}")
        else:
            print("没有处理好的图片可以合并为PDF。")
    except FileNotFoundError:
        print(f"文件{pdf_file}未找到。")
    except fitz.FileDataError:
        print(f"无法读取文件数据{pdf_file}。")
    except Exception as e:
        print(f"处理PDF文件时出错:{e}")
```

### 主程序入口

在主程序中，我们从用户获取输入路径、输出路径以及水印颜色的上下界，然后调用 remove_pdf 函数：

```python
if __name__ == "__main__":
    pdf_path = input("请输入PDF地址：")
    output_path = input("请输入保存处理后的图片的文件夹地址：")
    output_pdf_path = input("请输入保存处理后的PDF地址：")
    lower_bound = tuple(map(int, input("请输入水印颜色的下界（例如：168,168,168）：").split(',')))
    upper_bound = tuple(map(int, input("请输入水印颜色的上界（例如：172,172,172）：").split(',')))
    remove_pdf(pdf_path, output_path, output_pdf_path, lower_bound=lower_bound, upper_bound=upper_bound)
```

## 代码解析

以下是本文中使用的完整代码，添加了中文注释，帮助理解每个步骤的功能：

```python
import fitz  # PyMuPDF
from PIL import Image
import numpy as np
import cv2
import os
import concurrent.futures


def remove_watermark(image, lower_bound, upper_bound):
    """去除水印"""
    # 将PIL图像转换为OpenCV格式
    open_cv_image = np.array(image)
    open_cv_image = cv2.cvtColor(open_cv_image, cv2.COLOR_RGB2BGR)

    # 创建掩码，查找在指定颜色范围内的像素
    mask = cv2.inRange(open_cv_image, lower_bound, upper_bound)

    # 使用膨胀和侵蚀操作优化掩码
    kernel = np.ones((3, 3), np.uint8)
    mask = cv2.dilate(mask, kernel, iterations=1)
    mask = cv2.erode(mask, kernel, iterations=1)

    # 将掩码范围内的像素设为白色
    open_cv_image[mask != 0] = [255, 255, 255]

    # 将图像转换回PIL格式
    return Image.fromarray(cv2.cvtColor(open_cv_image, cv2.COLOR_BGR2RGB))


def process_page(pdf_path, page_num, output_folder, mat, lower_bound, upper_bound):
    """处理单页"""
    try:
        # 打开PDF文件并加载特定页
        pdf = fitz.open(pdf_path)
        page = pdf.load_page(page_num)
        pixmap = page.get_pixmap(matrix=mat)
        img = Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)

        # 调用remove_watermark函数去除水印
        img = remove_watermark(img, lower_bound, upper_bound)

        # 将处理后的图像保存为PNG文件
        img_path = os.path.join(output_folder, f"{page_num}.png")
        img.save(img_path, format="PNG")
        print(f"第{page_num}页水印去除完成")
        pdf.close()
    except fitz.FileDataError:
        print(f"无法读取第{page_num}页的数据。")
    except fitz.PDFPageError:
        print(f"第{page_num}页无法加载。")
    except Exception as e:
        print(f"处理第{page_num}页时出错:{e}")


def remove_pdf(pdf_file, output_folder, output_pdf_path, dpi=1800,
               lower_bound=(168, 168, 168), upper_bound=(172, 172, 172)):
    """去除PDF水印的主函数"""
    if not os.path.exists(pdf_file):
        print(f"文件{pdf_file}未找到。")
        return
    if not pdf_file.lower().endswith('.pdf'):
        print(f"文件{pdf_file}不是PDF文件。")
        return
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    zoom = dpi / 72
    mat = fitz.Matrix(zoom, zoom)

    try:
        pdf = fitz.open(pdf_file)
        total_pages = len(pdf)
        pdf.close()

        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = [
                executor.submit(process_page, pdf_file, page_num, output_folder, mat, lower_bound, upper_bound)
                for page_num in range(total_pages)
            ]
            for future in concurrent.futures.as_completed(futures):
                try:
                    future.result()
                except Exception as e:
                    print(f"处理过程中发生错误:{e}")

        image_files = [os.path.join(output_folder, f"{page_num}.png") for page_num in range(total_pages)]
        image_list = [Image.open(img_file).convert("RGB") for img_file in image_files]
        if image_list:
            image_list[0].save(output_pdf_path, save_all=True, append_images=image_list[1:])
            print(f"处理后的PDF文件保存为:{output_pdf_path}")
        else:
            print("没有处理好的图片可以合并为PDF。")
    except FileNotFoundError:
        print(f"文件{pdf_file}未找到。")
    except fitz.FileDataError:
        print(f"无法读取文件数据{pdf_file}。")
    except Exception as e:
        print(f"处理PDF文件时出错:{e}")


if __name__ == "__main__":
    pdf_path = input("请输入PDF地址：")
    output_path = input("请输入保存处理后的图片的文件夹地址：")
    output_pdf_path = input("请输入保存处理后的PDF地址：")
    lower_bound = tuple(map(int, input("请输入水印颜色的下界（例如：168,168,168）：").split(',')))
    upper_bound = tuple(map(int, input("请输入水印颜色的上界（例如：172,172,172）：").split(',')))
    remove_pdf(pdf_path, output_path, output_pdf_path, lower_bound=lower_bound, upper_bound=upper_bound)
```

通过上面的代码，我们可以实现自动去除PDF水印的功能。如果你在实现过程中遇到问题或有更好的改进建议，欢迎在评论区分享。让我们一起学习进步！

## 对比前后

去除前

去除后

## 总结

本文介绍了如何使用Python去除PDF文件中的水印。我们通过结合使用PyMuPDF、Pillow和OpenCV库，实现了从PDF提取页面、处理图像并去除水印的完整流程。该方法自动化程度高，适用于批量处理带有水印的PDF文件。

希望这篇文章能对你有所帮助！如果你在实现过程中遇到问题或有更好的改进建议，欢迎在评论区分享。让我们一起学习进步！

## 参考资料

- PyMuPDF官方文档：链接
- OpenCV官方文档：链接
- Python官方文档：链接

		自动登录	找回密码
密码			会员注册