Extract each component of a PDF document using fitz and create a new PDF document that has everything the same as the original document - using Python

I have a PDF document. I am trying to extract all components of the document (page layout, page size, text, text font, text color, text background color, tables, table grid lines, table cell background colors, images present in the document, the area where each image is placed, etc.) and recreate a new PDF document from all of that information. I am using pdfplumber & reportlab to get this done. I am not able to recreate a new document that looks exactly like my original document. Here is the code I use to do this:

def get_page_orientation(pdf_path):
    """Return the first page's width, height and orientation label.

    Args:
        pdf_path: Path of the PDF file to open with ``fitz``.

    Returns:
        A ``(width, height, orientation)`` tuple, where ``orientation`` is
        ``"Landscape"`` when the page is wider than it is tall and
        ``"Portrait"`` otherwise.
    """
    # Use the document as a context manager so the underlying file is
    # released even when reading the first page raises.
    with fitz.open(pdf_path) as doc:
        page_rect = doc[0].rect
        width, height = page_rect.width, page_rect.height

    orientation = "Landscape" if width > height else "Portrait"
    return width, height, orientation


def convert_color(int_color):
    """Unpack a 0xRRGGBB integer into an ``(r, g, b)`` tuple of floats.

    Each 8-bit channel is masked out of the integer and divided by 255, so
    every returned component lies between 0 and 1.
    """
    channels = [(int_color >> shift) & 0xFF for shift in (16, 8, 0)]
    red, green, blue = (channel / 255 for channel in channels)
    return red, green, blue
def register_font(font_name):
    """Make ``font_name`` usable for drawing and return the name to use.

    If the font is already registered it is returned unchanged. Otherwise
    the name is looked up in a small table of TrueType files; when the file
    exists on disk the font is registered under ``font_name``. In every
    other case the function falls back to ``"Helvetica"``.
    """
    if font_name in pdfmetrics.getRegisteredFontNames():
        return font_name

    # Known PostScript font names and the TrueType file expected for each.
    known_ttf_files = {
        "ArialMT": "Arial.ttf",
        "TimesNewRomanPSMT": "TimesNewRoman.ttf",
        "Helvetica": "Helvetica.ttf",
    }
    ttf_file = known_ttf_files.get(font_name)

    # Unknown name or missing file: use the fallback font instead.
    if not ttf_file or not os.path.exists(ttf_file):
        return "Helvetica"

    pdfmetrics.registerFont(TTFont(font_name, ttf_file))
    return font_name
def auto_adjust_font(canvas_obj, text, x, y, width_limit, font_name="Helvetica", max_font_size=12):
    """Draw ``text`` at ``(x, y)``, shrinking the font until it fits.

    The size starts at ``max_font_size`` and is reduced one unit at a time
    while the rendered width exceeds ``width_limit``; shrinking stops once
    the size is no longer greater than 6. A falsy ``text`` (such as
    ``None``) is drawn as an empty string.
    """
    label = text if text else ""

    size = max_font_size
    while True:
        # Never shrink at or below the minimum size of 6.
        if size <= 6:
            break
        if canvas_obj.stringWidth(label, font_name, size) <= width_limit:
            break
        size -= 1

    canvas_obj.setFont(font_name, size)
    canvas_obj.drawString(x, y, label)
def extract_text_and_colors(pdf_path):
    """Collect every text span of a PDF together with its colours.

    Args:
        pdf_path: Path of the PDF file to open with ``fitz``.

    Returns:
        A list with one entry per page. Each entry is a list of
        ``(bbox, text, font, size, text_color, bg_color)`` tuples, one per
        text span. ``text_color`` is the span colour converted by
        ``convert_color``; ``bg_color`` is the stroke colour of the first
        matching annotation that intersects the span's block, or white
        ``(1, 1, 1)`` when there is none.
    """
    default_bg = (1, 1, 1)  # white background when no annotation matches
    extracted_data = []

    # Use the document as a context manager so the file is closed once all
    # pages have been read (the original left it open).
    with fitz.open(pdf_path) as doc:
        for page in doc:
            blocks = page.get_text("dict")["blocks"]

            # NOTE(review): type code 9 appears to be PDF_ANNOT_UNDERLINE in
            # PyMuPDF's annotation-type table (highlight is 8) - confirm
            # which one is intended before relying on these colours.
            # NOTE(review): cell shading and table grid lines are normally
            # vector graphics, not annotations; they would presumably have
            # to be read with page.get_drawings() - verify against the docs.
            annotation_bg_colors = [
                (annot.rect, annot.colors.get("stroke", default_bg))
                for annot in (page.annots() or [])
                if annot.type[0] == 9
            ]

            page_data = []
            for block in blocks:
                spans = [
                    span
                    for line in block.get("lines", [])
                    for span in line["spans"]
                ]
                if not spans:
                    continue

                # The background depends only on the block, so resolve it
                # once per block instead of once per span.
                bg_color = default_bg
                if annotation_bg_colors:
                    block_rect = fitz.Rect(block.get("bbox"))
                    bg_color = next(
                        (
                            color
                            for rect, color in annotation_bg_colors
                            if block_rect.intersects(rect)
                        ),
                        default_bg,
                    )

                page_data.extend(
                    (
                        span["bbox"],
                        span["text"],
                        span["font"],
                        span["size"],
                        convert_color(span.get("color", 0)),
                        bg_color,
                    )
                    for span in spans
                )

            extracted_data.append(page_data)

    return extracted_data
def translate_text(text, target_language="fr"):
    """Translate ``text`` into ``target_language`` and always return a string.

    Empty or whitespace-only input yields ``""``. If the translator raises,
    the error is printed and the original text is returned; the original
    text is also returned when the translator produces an empty result.
    """
    has_content = bool(text and text.strip())
    if not has_content:
        return ""

    try:
        result = GoogleTranslator(source="auto", target=target_language).translate(text)
    except Exception as e:
        # Best effort: report the failure and keep the untranslated text.
        print(f"Translation error: {e}")
        return text

    return result if result else text
def create_translated_pdf(input_pdf, output_pdf, target_language="fr"):
    """Write a translated copy of ``input_pdf`` to ``output_pdf``.

    Every non-blank text span extracted from the input is translated and
    drawn at the position of its original bounding box, preceded by a
    filled background rectangle when the span has a non-white background.

    Args:
        input_pdf: Path of the source PDF.
        output_pdf: Path of the PDF file to create.
        target_language: Language code passed to ``translate_text``.
    """
    text_data = extract_text_and_colors(input_pdf)
    width, height, _orientation = get_page_orientation(input_pdf)

    # Size the output page like the source page. The vertical flip below is
    # computed against the source height, so the canvas must share it; the
    # original forced Letter size, which misplaced content on other sizes.
    pdf_canvas = canvas.Canvas(output_pdf, pagesize=(width, height))

    for page_data in text_data:
        for bbox, text, font_name, font_size, text_color, bg_color in page_data:
            if not text.strip():
                continue  # nothing visible to draw for blank spans

            font_name = register_font(font_name)
            translated_text = translate_text(text, target_language)

            x0, top, x1, bottom = bbox
            box_width = x1 - x0
            # Measure the height before flipping: the original computed it
            # from the already-flipped y and got a wrong rectangle height.
            box_height = bottom - top
            # Flip the vertical axis: the extracted bbox measures y downwards
            # from the page top, the canvas upwards from the page bottom.
            y = height - bottom

            # bg_color may be missing or a list; normalise before comparing
            # so both the white check and the unpacking are safe.
            if bg_color and tuple(bg_color) != (1, 1, 1):
                pdf_canvas.saveState()
                pdf_canvas.setFillColorRGB(*bg_color)
                pdf_canvas.rect(x0, y, box_width, box_height, fill=1, stroke=0)
                pdf_canvas.restoreState()

            pdf_canvas.setFillColorRGB(*text_color)
            auto_adjust_font(
                pdf_canvas, translated_text, x0, y, box_width, font_name, font_size
            )

        pdf_canvas.showPage()

    pdf_canvas.save()
    print(f"Translated PDF saved as: {output_pdf}")
def translate_pdf(input_pdf, output_pdf, target_language="fr"):
    """Translate ``input_pdf`` into ``target_language`` and save it as ``output_pdf``.

    Thin entry point that forwards its arguments to ``create_translated_pdf``.
    """
    create_translated_pdf(
        input_pdf=input_pdf,
        output_pdf=output_pdf,
        target_language=target_language,
    )

# Module-level call that runs the whole translation.
# NOTE(review): input_pdf, output_pdf and target_language are not defined in
# the visible snippet; they must be assigned before this line runs.
translate_pdf(input_pdf, output_pdf, target_language)

Attached are the results of the code: [The left side of the image is the original page and the right side is the newly created, translated page of the PDF. You can see that the background colors behind the text are missing in the translated file, and that neither the table grid lines nor the table cell shading appear in the output document.]

Need your expert help in fixing this issue.

Answer

@Jorj X. McKie: Please share your view on this. How can I extract all such components from a PDF - especially the table lines and the table cell colors that sit behind the text - and how can I recreate the new PDF using the extracted features?

Answer

Enjoyed this article?