Convert the input file to UTF-8 if needed

This commit is contained in:
2023-12-05 13:08:18 +01:00
parent 21e62e47d3
commit debb6606cf
3 changed files with 55 additions and 28 deletions

View File

@@ -31,7 +31,7 @@
{% if file_url %} {% if file_url %}
<h2 class="text-info font-semi-bold">Cleanup réussi</h2> <h2 class="text-info font-semi-bold">Cleanup réussi</h2>
Votre fichier nettoyé : <b>{{ file_name }}</b> <br><br> {{ libelle }} <b>{{ file_name }}</b> <br><br>
<a class="btn btn-primary" href="{{ file_url }}" download="{{ file_name }}"><span class="glyphicon glyphicon-arrow-down"></span> Télécharger</a> <a class="btn btn-primary" href="{{ file_url }}" download="{{ file_name }}"><span class="glyphicon glyphicon-arrow-down"></span> Télécharger</a>
{% endif %} {% endif %}

View File

@@ -7,6 +7,7 @@ import sys
import datetime import datetime
import time import time
import chardet import chardet
import html
@view_config(route_name='home', renderer='html_cleanup:templates/home.jinja2') @view_config(route_name='home', renderer='html_cleanup:templates/home.jinja2')
def home(request): def home(request):
@@ -14,77 +15,101 @@ def home(request):
message = '' message = ''
file_name = '' file_name = ''
file_url = '' file_url = ''
input_encoding = ''
libelle = ''
if 'form.submitted' in request.params: if 'form.submitted' in request.params:
if request.POST['uploadfile'] != b'': if request.POST['uploadfile'] != b'':
input_file = request.POST['uploadfile'].file input_file = request.POST['uploadfile'].file
input_name = request.POST['uploadfile'].filename input_name = request.POST['uploadfile'].filename
# récupère le fichier download, faire les controles et traiter # récupère le fichier download, faire les controles et traiter
message = process_file(request, input_file, input_name) message, input_encoding = process_file(request, input_file, input_name)
if message == '': if message == '':
file_name = "clean_" + input_name file_name = "clean_" + input_name
file_url = request.static_url('html_cleanup:static/temp/') + file_name file_url = request.static_url('html_cleanup:static/temp/') + file_name
if input_encoding == 'utf-8':
libelle = "Votre fichier est nettoyé : "
else:
libelle = "Votre fichier est converti et nettoyé : "
return { return {
'page_title': "HTML cleanup", 'page_title': "HTML cleanup",
'message': message, 'message': message,
'file_url': file_url, 'file_url': file_url,
'file_name': file_name, 'file_name': file_name,
'libelle': libelle,
} }
def process_file(request, input_file, input_name):
    """Validate an uploaded HTML file, store it as UTF-8 and clean it up.

    The upload is checked in this order: MIME type, size, detectable and
    decodable text encoding, presence of ``<body`` / ``</body>``, and a
    plain (path-free) file name. Only when every check passes is the
    content written, re-encoded as UTF-8, to ``temp_folder`` and handed to
    ``clean_html``, which writes ``clean_<input_name>`` in the same folder.

    Args:
        request: request object; ``request.registry.settings['temp_folder']``
            gives the working directory.
        input_file: binary file-like object holding the uploaded content.
        input_name: file name supplied by the client for the upload.

    Returns:
        A ``(message, input_encoding)`` tuple on every path, so the caller
        can always unpack two values. ``message`` is ``''`` on success or a
        user-facing error text on refusal; ``input_encoding`` is the
        lower-cased encoding reported by chardet, or ``''`` when the file
        was refused before an encoding could be determined.
    """
    allowed_mime_types = ['text/html']
    max_size = 10 * (1024 ** 2)  # 10 MiB
    temp_folder = request.registry.settings['temp_folder']
    logfile_name = os.path.join(temp_folder, 'errors_log')

    def refuse(message, encoding=''):
        # Log the refusal and return the same 2-tuple shape as the success
        # path; a bare string here would break the caller's unpacking.
        add_error2log(logfile_name, input_name, message)
        return message, encoding

    # Read the upload once and reuse the bytes for every check below.
    input_file.seek(0)
    raw_bytes = input_file.read()

    # --- check the mime type
    mime = magic.from_buffer(raw_bytes, mime=True)
    if mime not in allowed_mime_types:
        return refuse("Le format du fichier n'est pas valide. Téléchargement refusé.")

    # --- check the size
    if len(raw_bytes) > max_size:
        return refuse("La taille du fichier dépasse la limite autorisée. Téléchargement refusé.")

    # --- detect the text encoding; chardet reports None when it cannot tell
    detected_name = chardet.detect(raw_bytes).get('encoding')
    if not detected_name:
        return refuse("L'encodage du fichier n'a pas pu être déterminé. Téléchargement refusé.")
    input_encoding = detected_name.lower()

    # --- decode the content; the guess may be wrong (UnicodeDecodeError) or
    # name a codec Python does not provide (LookupError)
    try:
        text = raw_bytes.decode(input_encoding)
    except (LookupError, UnicodeDecodeError):
        return refuse(
            "L'encodage du fichier n'a pas pu être déterminé. Téléchargement refusé.",
            input_encoding,
        )

    # --- check that the document has a <body> element, whatever its encoding
    if '<body' not in text or '</body>' not in text:
        return refuse(
            "Le format du fichier n'est pas valide (absence de tag <body> ou </body>). Téléchargement refusé.",
            input_encoding,
        )

    # --- the file name comes from the client: refuse anything that is not a
    # plain file name so nothing can be written outside temp_folder
    if (not input_name or input_name in ('.', '..')
            or os.path.basename(input_name) != input_name):
        return refuse(
            "Le nom du fichier n'est pas valide. Téléchargement refusé.",
            input_encoding,
        )

    # --- store the content as UTF-8; binary mode keeps line endings untouched
    # and 'wb' truncates any previous file of the same name
    input_filename = os.path.join(temp_folder, input_name)
    with open(input_filename, 'wb') as work_file:
        work_file.write(text.encode('utf-8'))
    if input_encoding != 'utf-8':
        add_error2log(
            logfile_name, input_name,
            "Le fichier est converti de " + input_encoding + " en utf-8.",
        )

    # --- all checks passed: clean up the stored file
    output_name = "clean_" + input_name
    output_file = os.path.join(temp_folder, output_name)
    message = clean_html(input_filename, output_file)
    if message:
        add_error2log(logfile_name, input_name, message)
    else:
        add_error2log(logfile_name, input_name, "*** Cleanup réussi. ***")
    return message, input_encoding
def clean_html(input_file, output_file): def clean_html(input_file, output_file):
# cleanup undesirable tags in html file # cleanup undesirable tags in html file
@@ -92,15 +117,15 @@ def clean_html(input_file, output_file):
message = "" message = ""
try: try:
with open(output_file, 'w') as fo: with open(input_file, 'r') as fi, open(output_file, 'w') as fo:
fo.write("<html>\n<head>\n<meta http-equiv=\"Content-Type\" content=\"text/html; charset={}\">\n<title>untitled</title>\n</head>\n".format(encoding)) fo.write("<html>\n<head>\n<meta http-equiv=\"Content-Type\" content=\"text/html; charset={}\">\n<title>untitled</title>\n</head>\n".format(encoding))
body = False body = False
skip_tag = False skip_tag = False
nb_lines = 0 nb_lines = 0
for line_in_bytes in input_file: for line_in in fi:
line_in = line_in_bytes.decode('utf-8') line_in = html.unescape(line_in)
line_out = "" line_out = ""
if not body: if not body:

View File

@@ -14,6 +14,8 @@ requires = [
'pyramid_jinja2', 'pyramid_jinja2',
'pyramid_debugtoolbar', 'pyramid_debugtoolbar',
'python-magic', 'python-magic',
'html',
'waitress', 'waitress',
] ]