diff --git a/html_cleanup/templates/home.jinja2 b/html_cleanup/templates/home.jinja2 index 76c158a..1ff9a12 100644 --- a/html_cleanup/templates/home.jinja2 +++ b/html_cleanup/templates/home.jinja2 @@ -31,7 +31,7 @@ {% if file_url %}

Cleanup réussi

- Votre fichier nettoyé : {{ file_name }}

+ {{ libelle }} {{ file_name }}

Télécharger {% endif %}
diff --git a/html_cleanup/views/default.py b/html_cleanup/views/default.py
index 717aeb0..ff588b1 100644
--- a/html_cleanup/views/default.py
+++ b/html_cleanup/views/default.py
@@ -7,6 +7,7 @@ import sys
 import datetime
 import time
 import chardet
+import html
 @view_config(route_name='home', renderer='html_cleanup:templates/home.jinja2')
 def home(request):
@@ -14,77 +15,101 @@ def home(request):
     message = ''
     file_name = ''
     file_url = ''
+    input_encoding = ''
+    libelle = ''
     if 'form.submitted' in request.params:
         if request.POST['uploadfile'] != b'':
             input_file = request.POST['uploadfile'].file
             input_name = request.POST['uploadfile'].filename
             # récupère le fichier download, faire les controles et traiter
-            message = process_file(request, input_file, input_name)
+            message, input_encoding = process_file(request, input_file, input_name)
             if message == '':
                 file_name = "clean_" + input_name
                 file_url = request.static_url('html_cleanup:static/temp/') + file_name
+                if input_encoding == 'utf-8':
+                    libelle = "Votre fichier est nettoyé : "
+                else:
+                    libelle = "Votre fichier est converti et nettoyé : "
     return {
         'page_title': "HTML cleanup",
         'message': message,
         'file_url': file_url,
         'file_name': file_name,
+        'libelle': libelle,
     }
 def process_file(request, input_file, input_name):
-    # Check file mime type and size
-    # and if OK, process file
+    # check input_file validity (mime type, size), then clean it up
+    # returns a (message, input_encoding) pair on every path:
+    # message is an error text ('' on success), input_encoding is '' on early rejection
+
+    message = ''
     EXT_ALLOWED = ['text/html']
     MAX_SIZE = 10 * (1024 ** 2) # 10 Mb
     temp_folder = request.registry.settings['temp_folder']
     logfile_name = os.path.join(temp_folder, 'errors_log')
-    # --- controler le mime type
+    # --- check the mime type
     mime = magic.from_buffer(input_file.read(), mime=True)
-    # types de fichiers autorisés ?
     if mime not in EXT_ALLOWED:
         message = "Le format du fichier n'est pas valide. Téléchargement refusé."
         add_error2log(logfile_name, input_name, message)
-        return message
+        return message, ''
-    # lire la taille du fichier
+    # get input_file size
     input_file.seek(0, 2) #seek to end
     filesize = input_file.tell()
     input_file.seek(0) # back to begining position
-    # --- controler la taille du fichier
+    # --- check input_file size
     if filesize > MAX_SIZE:
         message = "La taille du fichier dépasse la limite autorisée. Téléchargement refusé."
         add_error2log(logfile_name, input_name, message)
-        return message
+        return message, ''
-    # --- controler l'encodage UTF-8
-    try:
-        data = input_file.read().decode('utf-8')
-    except Exception as e:
-        message = "L'encodage fichier n'est pas valide (non utf-8). Téléchargement refusé."
-        add_error2log(logfile_name, input_name, message)
-        return message
+    # Finally write the data to a temporary file (basename only: the upload name is untrusted)
+    input_filename = os.path.join(temp_folder, os.path.basename(input_name))
+    # remove the file if it already exists
+    if os.path.exists(input_filename):
+        os.remove(input_filename)
-    # import pdb;pdb.set_trace()
-    # --- controle presence tag
-    if data.find('') == -1:
-        message = "Le format du fichier n'est pas valide (absence de tag ou ). Téléchargement refusé."
-        add_error2log(logfile_name, input_name, message)
-        return message
+    # copy the uploaded file into the temporary file
+    input_file.seek(0)
+    with open(input_filename, 'wb') as output_file:
+        shutil.copyfileobj(input_file, output_file)
-    input_file.seek(0) # back to begining position
-    # controle OK, traiter le fichier
+    # detect text encoding and transform to utf-8 if needed
+    with open(input_filename, 'rb') as f:
+        raw_bytes = f.read()
+    detected = chardet.detect(raw_bytes)
+    input_encoding = (detected['encoding'] or 'utf-8').lower()
+    if input_encoding != 'utf-8':
+        with open(input_filename, 'r', encoding=input_encoding) as f:
+            input_buffer = f.read()
+        # --- check if input_file contains tag
+        if input_buffer.find('') == -1:
+            message = "Le format du fichier n'est pas valide (absence de tag ou ). Téléchargement refusé."
+            add_error2log(logfile_name, input_name, message)
+            return message, input_encoding
+
+        # --- convert input_file encoding to utf-8
+        with open(input_filename, 'w', encoding='utf-8') as f:
+            message = "Le fichier est converti de " + input_encoding + " en utf-8."
+            add_error2log(logfile_name, input_name, message)
+            f.write(input_buffer)
+
+
+    # controls OK, cleanup input_file
     output_name = "clean_" + input_name
     output_file = os.path.join(temp_folder, output_name)
-    message = clean_html(input_file, output_file)
+    message = clean_html(input_filename, output_file)
     if message:
         add_error2log(logfile_name, input_name, message)
     else:
         add_error2log(logfile_name, input_name, "*** Cleanup réussi. ***")
-    return message
+    return message, input_encoding
 def clean_html(input_file, output_file):
     # cleanup undesirable tags in html file
@@ -92,15 +117,15 @@ def clean_html(input_file, output_file):
     message = ""
     try:
-        with open(output_file, 'w') as fo:
+        with open(input_file, 'r', encoding='utf-8') as fi, open(output_file, 'w') as fo:
             fo.write("\n\n\nuntitled\n\n".format(encoding))
             body = False
             skip_tag = False
             nb_lines = 0
-            for line_in_bytes in input_file:
-                line_in = line_in_bytes.decode('utf-8')
+            for line_in in fi:
+                line_in = html.unescape(line_in)
                 line_out = ""
                 if not body:
diff --git a/setup.py b/setup.py
index a4b939c..60a2c27 100644
--- a/setup.py
+++ b/setup.py
@@ -14,6 +14,7 @@ requires = [
     'pyramid_jinja2',
     'pyramid_debugtoolbar',
     'python-magic',
+    'waitress',
     ]