Batch Processing Examples
This section provides practical examples for batch processing files with OpenConvert.
Document Batch Processing
Convert All Text Files to PDF
# Simple batch conversion
openconvert -i documents/ -o pdfs/ --to application/pdf
# With custom formatting
openconvert -i documents/ -o pdfs/ --to application/pdf \\
--prompt "Professional layout with headers"
# Process specific file pattern
openconvert -i "*.txt" -o converted/ --to application/pdf
Directory Structure Processing
#!/bin/bash
# Process entire directory tree
find /path/to/documents -name "*.txt" -type f | while read file; do
# Create corresponding output path
output_dir="converted/$(dirname "$file")"
mkdir -p "$output_dir"
# Convert file
openconvert -i "$file" -o "$output_dir/$(basename "$file" .txt).pdf"
done
Parallel Processing
# Process multiple files in parallel
ls *.txt | xargs -I {} -P 4 openconvert -i {} -o {}.pdf
# GNU parallel (if available)
parallel openconvert -i {} -o {.}.pdf ::: *.txt
Python Batch Processing
from openconvert import convert_file
from pathlib import Path
import concurrent.futures
from typing import List, Tuple
def batch_convert_documents(
input_dir: str,
output_dir: str,
max_workers: int = 4
) -> List[Tuple[str, bool]]:
"""Convert all text files in directory to PDF."""
input_path = Path(input_dir)
output_path = Path(output_dir)
output_path.mkdir(parents=True, exist_ok=True)
# Find all text files
text_files = list(input_path.glob("**/*.txt"))
def convert_single(file_path: Path) -> Tuple[str, bool]:
"""Convert a single file."""
relative_path = file_path.relative_to(input_path)
output_file = output_path / relative_path.with_suffix('.pdf')
# Create output directory if needed
output_file.parent.mkdir(parents=True, exist_ok=True)
success = convert_file(
str(file_path),
str(output_file),
prompt="Professional document formatting"
)
return str(relative_path), success
# Process files in parallel
results = []
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
future_to_file = {
executor.submit(convert_single, file_path): file_path
for file_path in text_files
}
for future in concurrent.futures.as_completed(future_to_file):
try:
result = future.result()
results.append(result)
print(f"{'✅' if result[1] else '❌'} {result[0]}")
except Exception as e:
file_path = future_to_file[future]
print(f"❌ {file_path}: {e}")
results.append((str(file_path), False))
return results
# Usage
results = batch_convert_documents("documents/", "converted_pdfs/")
successful = sum(1 for _, success in results if success)
print(f"Converted {successful}/{len(results)} files successfully")
Image Batch Processing
Resize and Convert Images
# Convert all JPEG images to WebP with compression
openconvert -i photos/ -o web_ready/ \\
--from image/jpeg --to image/webp \\
--prompt "Compress for web, resize to max 1920px width"
# Create thumbnails
openconvert -i photos/ -o thumbnails/ \\
--from image/jpeg --to image/jpeg \\
--prompt "Create 300x300 thumbnails"
Photo Library Processing
from openconvert import convert_file
from pathlib import Path
import logging
def process_photo_library(
input_dir: str,
output_dir: str,
create_thumbnails: bool = True,
web_format: bool = True
):
"""Process entire photo library with multiple output formats."""
input_path = Path(input_dir)
output_path = Path(output_dir)
# Supported image extensions
image_extensions = {'.jpg', '.jpeg', '.png', '.tiff', '.bmp'}
for image_file in input_path.rglob('*'):
if image_file.suffix.lower() in image_extensions:
relative_path = image_file.relative_to(input_path)
# Create directory structure
output_subdir = output_path / relative_path.parent
output_subdir.mkdir(parents=True, exist_ok=True)
# Web-optimized version
if web_format:
web_file = output_subdir / f"{relative_path.stem}_web.webp"
success = convert_file(
str(image_file),
str(web_file),
prompt="Optimize for web: compress and resize to max 1920px"
)
if success:
print(f"✅ Web: {relative_path}")
else:
print(f"❌ Web: {relative_path}")
# Thumbnail version
if create_thumbnails:
thumb_file = output_subdir / f"{relative_path.stem}_thumb.jpg"
success = convert_file(
str(image_file),
str(thumb_file),
prompt="Create 300x300 thumbnail, crop to center"
)
if success:
print(f"✅ Thumb: {relative_path}")
else:
print(f"❌ Thumb: {relative_path}")
# Usage
process_photo_library("raw_photos/", "processed_photos/")
Data Processing
CSV to Charts
# Convert all CSV files to chart PDFs
for csv_file in *.csv; do
openconvert -i "$csv_file" -o "${csv_file%.csv}_chart.pdf" \\
--prompt "Create bar charts and line graphs with data analysis"
done
Database Export Processing
from openconvert import convert_file
import pandas as pd
from pathlib import Path
def process_database_exports(export_dir: str, report_dir: str):
"""Convert database CSV exports to formatted reports."""
export_path = Path(export_dir)
report_path = Path(report_dir)
report_path.mkdir(parents=True, exist_ok=True)
# Define report templates for different data types
templates = {
'sales': "Create executive sales report with charts showing trends, top products, and regional performance",
'users': "Create user analytics report with demographics charts and activity graphs",
'inventory': "Create inventory report with stock levels, reorder alerts, and category breakdown",
'default': "Create professional data report with appropriate charts and analysis"
}
for csv_file in export_path.glob("*.csv"):
# Determine report type from filename
report_type = 'default'
for key in templates:
if key in csv_file.name.lower():
report_type = key
break
# Generate report
output_file = report_path / f"{csv_file.stem}_report.pdf"
success = convert_file(
str(csv_file),
str(output_file),
prompt=templates[report_type]
)
if success:
print(f"✅ Generated {report_type} report: {output_file.name}")
else:
print(f"❌ Failed to generate report for: {csv_file.name}")
# Usage
process_database_exports("exports/", "reports/")
Advanced Batch Patterns
Conditional Processing
from openconvert import convert_file
from pathlib import Path
import os
def smart_batch_convert(input_dir: str, output_dir: str):
"""Convert files only if output doesn't exist or is older."""
input_path = Path(input_dir)
output_path = Path(output_dir)
for input_file in input_path.rglob("*.txt"):
relative_path = input_file.relative_to(input_path)
output_file = output_path / relative_path.with_suffix('.pdf')
# Skip if output exists and is newer
if (output_file.exists() and
output_file.stat().st_mtime > input_file.stat().st_mtime):
print(f"⏭️ Skipping {relative_path} (up to date)")
continue
# Create output directory
output_file.parent.mkdir(parents=True, exist_ok=True)
# Convert file
success = convert_file(str(input_file), str(output_file))
if success:
print(f"✅ Converted {relative_path}")
else:
print(f"❌ Failed {relative_path}")
Error Recovery and Retry
from openconvert import convert_file
import time
import logging
from typing import List, Dict
def robust_batch_convert(
file_list: List[str],
output_dir: str,
max_retries: int = 3,
retry_delay: int = 5
) -> Dict[str, str]:
"""Batch convert with error recovery and retry logic."""
results = {}
failed_files = []
for input_file in file_list:
output_file = f"{output_dir}/{Path(input_file).stem}.pdf"
for attempt in range(max_retries + 1):
try:
success = convert_file(input_file, output_file)
if success:
results[input_file] = "success"
print(f"✅ {input_file} (attempt {attempt + 1})")
break
else:
if attempt < max_retries:
print(f"🔄 Retrying {input_file} in {retry_delay}s...")
time.sleep(retry_delay)
else:
results[input_file] = "failed_conversion"
failed_files.append(input_file)
print(f"❌ {input_file} failed after {max_retries + 1} attempts")
except Exception as e:
if attempt < max_retries:
print(f"🔄 Error with {input_file}, retrying: {e}")
time.sleep(retry_delay)
else:
results[input_file] = f"error: {e}"
failed_files.append(input_file)
print(f"❌ {input_file} error: {e}")
# Save failed files list for manual processing
if failed_files:
with open(f"{output_dir}/failed_conversions.txt", "w") as f:
f.write("\\n".join(failed_files))
return results
Progress Tracking
from openconvert import convert_file
from pathlib import Path
import time
from tqdm import tqdm # pip install tqdm
def batch_convert_with_progress(input_dir: str, output_dir: str):
"""Batch convert with progress bar and time estimation."""
input_path = Path(input_dir)
output_path = Path(output_dir)
output_path.mkdir(parents=True, exist_ok=True)
# Get list of files to process
files_to_process = list(input_path.glob("*.txt"))
successful = 0
failed = 0
start_time = time.time()
# Process with progress bar
with tqdm(files_to_process, desc="Converting files") as pbar:
for input_file in pbar:
output_file = output_path / f"{input_file.stem}.pdf"
file_start = time.time()
success = convert_file(str(input_file), str(output_file))
file_time = time.time() - file_start
if success:
successful += 1
status = "✅"
else:
failed += 1
status = "❌"
# Update progress bar
pbar.set_postfix({
'Success': successful,
'Failed': failed,
'Time': f"{file_time:.1f}s"
})
# Final statistics
total_time = time.time() - start_time
print(f"\\nBatch conversion completed:")
print(f" Successful: {successful}")
print(f" Failed: {failed}")
print(f" Total time: {total_time:.1f}s")
print(f" Average per file: {total_time/len(files_to_process):.1f}s")
Configuration-Based Processing
import yaml
from openconvert import convert_file
from pathlib import Path
def config_based_batch_convert(config_file: str):
"""Batch convert based on YAML configuration."""
with open(config_file, 'r') as f:
config = yaml.safe_load(f)
for job in config.get('conversion_jobs', []):
input_pattern = job['input_pattern']
output_dir = job['output_dir']
target_format = job.get('target_format', 'application/pdf')
prompt = job.get('prompt', '')
# Create output directory
Path(output_dir).mkdir(parents=True, exist_ok=True)
# Process files matching pattern
for input_file in Path('.').glob(input_pattern):
output_file = Path(output_dir) / f"{input_file.stem}.pdf"
success = convert_file(
str(input_file),
str(output_file),
to_format=target_format,
prompt=prompt
)
print(f"{'✅' if success else '❌'} {input_file} -> {output_file}")
Example configuration file (batch_config.yaml):
conversion_jobs:
- name: "Documentation to PDF"
input_pattern: "docs/*.md"
output_dir: "pdfs/docs"
target_format: "application/pdf"
prompt: "Professional documentation format with TOC"
- name: "Reports to PDF"
input_pattern: "reports/*.txt"
output_dir: "pdfs/reports"
target_format: "application/pdf"
prompt: "Executive report format with charts"
- name: "Images to WebP"
input_pattern: "images/*.jpg"
output_dir: "web_images"
target_format: "image/webp"
prompt: "Optimize for web, maintain quality"
Usage:
python batch_processor.py batch_config.yaml
Monitoring and Logging
Production Batch Processing
import logging
import json
from datetime import datetime
from openconvert import convert_file
from pathlib import Path
# Configure logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s',
handlers=[
logging.FileHandler('batch_conversion.log'),
logging.StreamHandler()
]
)
def production_batch_convert(input_dir: str, output_dir: str):
"""Production-ready batch conversion with comprehensive logging."""
job_id = datetime.now().strftime("%Y%m%d_%H%M%S")
logging.info(f"Starting batch job {job_id}")
input_path = Path(input_dir)
output_path = Path(output_dir)
# Job statistics
stats = {
'job_id': job_id,
'start_time': datetime.now().isoformat(),
'total_files': 0,
'successful': 0,
'failed': 0,
'errors': []
}
try:
files_to_process = list(input_path.glob("*.txt"))
stats['total_files'] = len(files_to_process)
logging.info(f"Found {len(files_to_process)} files to process")
for input_file in files_to_process:
output_file = output_path / f"{input_file.stem}.pdf"
try:
success = convert_file(str(input_file), str(output_file))
if success:
stats['successful'] += 1
logging.info(f"✅ Converted: {input_file.name}")
else:
stats['failed'] += 1
error_msg = f"Conversion failed: {input_file.name}"
stats['errors'].append(error_msg)
logging.error(error_msg)
except Exception as e:
stats['failed'] += 1
error_msg = f"Exception processing {input_file.name}: {e}"
stats['errors'].append(error_msg)
logging.error(error_msg)
stats['end_time'] = datetime.now().isoformat()
stats['success_rate'] = stats['successful'] / stats['total_files'] * 100
# Save job report
report_file = output_path / f"batch_report_{job_id}.json"
with open(report_file, 'w') as f:
json.dump(stats, f, indent=2)
logging.info(f"Batch job completed: {stats['successful']}/{stats['total_files']} successful ({stats['success_rate']:.1f}%)")
except Exception as e:
logging.error(f"Batch job failed: {e}")
raise
# Usage
production_batch_convert("input_documents/", "output_pdfs/")
Best Practices
Start Small: Test with a few files before processing large batches
Monitor Resources: Watch CPU, memory, and disk usage during batch operations
Use Parallel Processing: But don’t overwhelm the network with too many concurrent requests
Implement Retry Logic: Network operations can fail temporarily
Log Everything: Essential for debugging and monitoring progress
Validate Results: Check that output files were created successfully
Handle Errors Gracefully: Don’t let one failed file stop the entire batch
Progress Tracking: Use progress bars for long-running operations
Configuration Files: Use config files for complex batch operations
Cleanup: Remove temporary files and handle disk space
See Also
Python API - Python API reference
CLI Reference - Command-line options
Python Integration Examples - More Python integration examples