Convert documents for RAG pipelines Leave feedback

import re
from groupdocs.markdown import MarkdownConverter, ConvertOptions, SkipImagesStrategy, MarkdownFlavor

def convert_for_rag():
    """Convert a PDF to Markdown for RAG pipelines, then split into chunks by heading.

    Converts ``business-plan.pdf`` with images skipped and the CommonMark
    flavor selected, splits the resulting Markdown in front of every level-1
    or level-2 heading, and prints the length and the first 80 characters of
    each non-blank chunk.

    Returns:
        None. The chunk previews are written to standard output.
    """

    # Step 1: Configure conversion for text-only RAG (skip images)
    options = ConvertOptions()
    options.image_export_strategy = SkipImagesStrategy()
    options.flavor = MarkdownFlavor.COMMON_MARK

    # Step 2: Convert the document using keyword argument for options
    markdown = MarkdownConverter.to_markdown("business-plan.pdf", convert_options=options)

    # Step 3: Split the Markdown into chunks by heading markers.
    # The lookahead consumes only the newline in front of a heading, so the
    # "# " / "## " marker stays at the start of its chunk. Splitting on the
    # marker itself would strip it and leave chunks without their heading level.
    chunks = re.split(r"\n(?=#{1,2} )", markdown)

    # Step 4: Process each chunk (e.g., send to an embedding model)
    for chunk in chunks:
        # Skip whitespace-only pieces (e.g. blank text before the first heading).
        if chunk.strip():
            print(f"Chunk ({len(chunk)} chars): {chunk[:80]}...")

# Run the example only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    convert_for_rag()

business-plan.pdf

business-plan.pdf is the sample file used in this example. Click here to download it.

convert-rag.txt

Chunk (36184 chars): 



**HOME**** ****BASED**** ****PROFESSIONAL**** **



**SERVICES**** **



***...

Download full output

convert-rag.txt

Chunk (34602 chars): 



**HOME BASED PROFESSIONAL SERVICES *Business Plan* TABLE OF CONTENTS** 



I...

Download full output

Batch processing a document library

batch_convert_for_rag.py

import os
import glob
from groupdocs.markdown import MarkdownConverter, ConvertOptions, SkipImagesStrategy, GroupDocsMarkdownException

def batch_convert_for_rag():
    """Convert every PDF matching ``documents/*.pdf`` to a sibling Markdown file.

    Each PDF is converted with images skipped, and the Markdown is saved as
    UTF-8 next to the source with the extension replaced by ``.md``. A file
    whose conversion raises ``GroupDocsMarkdownException`` is reported and
    skipped so the remaining files are still processed.
    """
    # Skip images so the converted output is text-only for RAG.
    rag_options = ConvertOptions()
    rag_options.image_export_strategy = SkipImagesStrategy()

    for pdf_path in glob.glob("documents/*.pdf"):
        # Only the conversion call is guarded; a failure moves on to the next file.
        try:
            md_text = MarkdownConverter.to_markdown(pdf_path, convert_options=rag_options)
        except GroupDocsMarkdownException as err:
            print(f"Skipped {pdf_path}: {err}")
            continue

        # Same directory and base name as the PDF, with a .md extension.
        stem, _ext = os.path.splitext(pdf_path)
        md_path = stem + ".md"
        with open(md_path, "w", encoding="utf-8") as md_file:
            md_file.write(md_text)
        print(f"Converted: {pdf_path}")

# Run the batch conversion only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    batch_convert_for_rag()

batch-convert-rag.txt

Converted: documents\business-plan.pdf

Download full output

batch-convert-rag.zip

documents/business-plan.txt (32 KB)

Download full output

We value your opinion. Your feedback will help us improve our documentation.

Convert documents for RAG pipelines Leave feedback

On this page

Basic conversion for RAG

Batch processing a document library

Was this page helpful?

Any additional feedback you'd like to share with us?

Please tell us how we can improve this page.

Thank you for your feedback!

On this page