-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathapp.py
More file actions
153 lines (124 loc) · 4.86 KB
/
app.py
File metadata and controls
153 lines (124 loc) · 4.86 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import os
import logging
from flask import Flask, render_template, request, jsonify, send_file
from werkzeug.utils import secure_filename
import pytesseract
from pdf2image import convert_from_bytes
from googletrans import Translator, LANGUAGES
from PyPDF2 import PdfReader, PdfWriter
import io
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import inch
from reportlab.platypus import Paragraph, SimpleDocTemplate, Spacer
import asyncio
# Logging: root logger at DEBUG, plus a module-level logger for this file.
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

# Flask application object shared by every route below.
app = Flask(__name__)
# NOTE(review): when SESSION_SECRET is unset the key falls back to a fixed,
# publicly visible string -- confirm the variable is always set in deployment.
app.secret_key = os.environ.get("SESSION_SECRET", "default-secret-key")

# Upload settings: scratch directory, accepted extensions, request size cap.
UPLOAD_FOLDER = '/tmp'
ALLOWED_EXTENSIONS = {'pdf'}
app.config.update(
    UPLOAD_FOLDER=UPLOAD_FOLDER,
    MAX_CONTENT_LENGTH=16 * 1024 * 1024,  # 16MB max file size
)
def allowed_file(filename, allowed_extensions=None):
    """Return True when *filename* ends in an allowed extension.

    The extension is the text after the last dot, compared
    case-insensitively.

    Args:
        filename: Name of the uploaded file. ``None`` or an empty string is
            treated as "not allowed" instead of raising.
        allowed_extensions: Optional collection of lower-case extensions
            (without the dot). Defaults to the module-level
            ``ALLOWED_EXTENSIONS``.

    Returns:
        bool: True if the filename has a dot and its extension is allowed.
    """
    if not filename or '.' not in filename:
        return False
    if allowed_extensions is None:
        allowed_extensions = ALLOWED_EXTENSIONS
    extension = filename.rsplit('.', 1)[1].lower()
    return extension in allowed_extensions
@app.route('/')
def index():
    """Serve the landing page rendered from the ``index.html`` template."""
    landing_page = render_template('index.html')
    return landing_page
def _split_into_chunks(text, max_chunk_size=5000):
    """Split *text* into pieces of at most *max_chunk_size* characters.

    Each cut is made just after the last space or newline inside the current
    window when one exists, so words are not broken in half. A window with no
    usable whitespace falls back to a hard cut at the size limit.
    """
    chunks = []
    start = 0
    text_length = len(text)
    while start < text_length:
        end = min(start + max_chunk_size, text_length)
        if end < text_length:
            cut = max(text.rfind(' ', start, end), text.rfind('\n', start, end))
            if cut > start:
                end = cut + 1
        chunks.append(text[start:end])
        start = end
    return chunks


def _extract_pages_text(pdf_bytes):
    """Rasterize the PDF and return the OCR text of each page, in order."""
    images = convert_from_bytes(pdf_bytes)
    return [pytesseract.image_to_string(image) for image in images]


async def _translate_pages(pages_text, target_lang):
    """Translate each page's text into *target_lang*; one string per page."""
    translator = Translator()
    translated_pages = []
    for page_text in pages_text:
        translated_chunks = []
        for chunk in _split_into_chunks(page_text):
            # Blank chunks carry nothing to translate; skip the round trip.
            if not chunk.strip():
                continue
            translation = translator.translate(chunk, dest=target_lang)
            # Tolerate translator versions whose translate() is synchronous.
            if asyncio.iscoroutine(translation):
                translation = await translation
            translated_chunks.append(translation.text)
        translated_pages.append(' '.join(translated_chunks))
    return translated_pages


def _build_translated_pdf(translated_pages):
    """Lay the translated text out as a PDF and return it as a BytesIO."""
    from xml.sax.saxutils import escape

    output_buffer = io.BytesIO()
    doc = SimpleDocTemplate(
        output_buffer,
        pagesize=letter,
        rightMargin=72,
        leftMargin=72,
        topMargin=72,
        bottomMargin=72,
    )

    # NOTE(review): the style inherits the sample stylesheet's default font;
    # presumably that font lacks glyphs for many non-Latin target languages
    # -- confirm and register a suitable font if needed.
    styles = getSampleStyleSheet()
    normal_style = ParagraphStyle(
        'CustomNormal',
        parent=styles['Normal'],
        fontSize=11,
        leading=14,
        spaceBefore=12,
        spaceAfter=12,
    )

    pdf_content = []
    for translated_text in translated_pages:
        for para in translated_text.split('\n\n'):
            if not para.strip():
                continue
            # Paragraph parses its text as XML-like markup, so literal
            # '&', '<' and '>' must be escaped before adding <br/> tags.
            markup = escape(para).replace('\n', '<br/>')
            pdf_content.append(Paragraph(markup, normal_style))
            pdf_content.append(Spacer(1, 12))

    doc.build(pdf_content)
    output_buffer.seek(0)
    return output_buffer


@app.route('/upload', methods=['POST'])
def upload_file():
    """Translate an uploaded PDF and return the result as a new PDF.

    Expects a multipart POST with a ``file`` part (a ``.pdf`` upload) and an
    optional ``language`` form field (default ``'en'``) that must be a key of
    googletrans' ``LANGUAGES``. The PDF is rasterized, OCR'd page by page,
    translated, and re-typeset as ``translated.pdf``.

    Returns:
        The generated PDF as an attachment on success; otherwise a JSON body
        ``{'error': ...}`` with status 400 for invalid requests or 500 when
        OCR, translation or PDF generation fails.
    """
    try:
        if 'file' not in request.files:
            return jsonify({'error': 'No file part'}), 400

        uploaded = request.files['file']
        target_lang = request.form.get('language', 'en')

        # Validate target language
        if target_lang not in LANGUAGES:
            return jsonify({'error': f'Unsupported target language: {target_lang}'}), 400

        # Covers both an empty string and a missing (None) filename.
        if not uploaded.filename:
            return jsonify({'error': 'No selected file'}), 400

        if not allowed_file(uploaded.filename):
            return jsonify({'error': 'Invalid file type'}), 400

        pdf_bytes = uploaded.read()
        if not pdf_bytes:
            return jsonify({'error': 'Uploaded file is empty'}), 400
        logger.debug("Processing PDF for translation to %s", target_lang)

        pages_text = _extract_pages_text(pdf_bytes)
        logger.debug("Extracted text from %d pages", len(pages_text))

        try:
            # One asyncio.run() call owns the event loop for the whole batch
            # and tears it down afterwards, leaving no closed loop installed
            # on the worker thread.
            translated_pages = asyncio.run(
                _translate_pages(pages_text, target_lang))
        except Exception as translation_error:
            logger.exception("Translation error: %s", translation_error)
            return jsonify({'error': f'Translation failed: {str(translation_error)}'}), 500
        logger.debug("Translated %d pages", len(translated_pages))

        output_buffer = _build_translated_pdf(translated_pages)
        return send_file(
            output_buffer,
            as_attachment=True,
            download_name='translated.pdf',
            mimetype='application/pdf',
        )

    except Exception as e:
        # Top-level boundary: log with traceback and report a JSON error.
        logger.exception("Error processing file: %s", e)
        return jsonify({'error': str(e)}), 500
if __name__ == '__main__':
    # The Werkzeug interactive debugger lets anyone who can reach the server
    # execute arbitrary Python, so it must not be switched on unconditionally
    # while listening on all interfaces. Opt in with FLASK_DEBUG=1 locally.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').strip().lower() in {
        '1', 'true', 'yes', 'on',
    }
    app.run(host='0.0.0.0', port=5000, debug=debug_enabled)