diff --git a/html_cleanup/templates/home.jinja2 b/html_cleanup/templates/home.jinja2
index 76c158a..1ff9a12 100644
--- a/html_cleanup/templates/home.jinja2
+++ b/html_cleanup/templates/home.jinja2
@@ -31,7 +31,7 @@
{% if file_url %}
Cleanup réussi
- Votre fichier nettoyé : {{ file_name }}
+ {{ libelle }} {{ file_name }}
Télécharger
{% endif %}
diff --git a/html_cleanup/views/default.py b/html_cleanup/views/default.py
index 717aeb0..ff588b1 100644
--- a/html_cleanup/views/default.py
+++ b/html_cleanup/views/default.py
@@ -7,6 +7,7 @@ import sys
import datetime
import time
import chardet
+import html
@view_config(route_name='home', renderer='html_cleanup:templates/home.jinja2')
def home(request):
@@ -14,77 +15,101 @@ def home(request):
message = ''
file_name = ''
file_url = ''
+ input_encoding = ''
+ libelle = ''
if 'form.submitted' in request.params:
if request.POST['uploadfile'] != b'':
input_file = request.POST['uploadfile'].file
input_name = request.POST['uploadfile'].filename
# récupère le fichier download, faire les controles et traiter
- message = process_file(request, input_file, input_name)
+ message, input_encoding = process_file(request, input_file, input_name)
if message == '':
file_name = "clean_" + input_name
file_url = request.static_url('html_cleanup:static/temp/') + file_name
+ if input_encoding == 'utf-8':
+ libelle = "Votre fichier est nettoyé : "
+ else:
+ libelle = "Votre fichier est converti et nettoyé : "
return {
'page_title': "HTML cleanup",
'message': message,
'file_url': file_url,
'file_name': file_name,
+ 'libelle': libelle,
}
def process_file(request, input_file, input_name):
- # Check file mime type and size
- # and if OK, process file
+ # Validate the upload (mime type, size), store it as utf-8, then clean it.
+ # Always returns (message, input_encoding): message is '' on success,
+ # input_encoding is '' when the file is rejected before detection.
+
message = ''
EXT_ALLOWED = ['text/html']
MAX_SIZE = 10 * (1024 ** 2) # 10 Mb
temp_folder = request.registry.settings['temp_folder']
logfile_name = os.path.join(temp_folder, 'errors_log')
- # --- controler le mime type
+ # --- check the mime type
mime = magic.from_buffer(input_file.read(), mime=True)
- # types de fichiers autorisés ?
if mime not in EXT_ALLOWED:
message = "Le format du fichier n'est pas valide. Téléchargement refusé."
add_error2log(logfile_name, input_name, message)
- return message
+ return message, ''
- # lire la taille du fichier
+ # get input_file size
input_file.seek(0, 2) #seek to end
filesize = input_file.tell()
input_file.seek(0) # back to begining position
- # --- controler la taille du fichier
+ # --- check input_file size
if filesize > MAX_SIZE:
message = "La taille du fichier dépasse la limite autorisée. Téléchargement refusé."
add_error2log(logfile_name, input_name, message)
- return message
+ return message, ''
- # --- controler l'encodage UTF-8
- try:
- data = input_file.read().decode('utf-8')
- except Exception as e:
- message = "L'encodage fichier n'est pas valide (non utf-8). Téléchargement refusé."
- add_error2log(logfile_name, input_name, message)
- return message
+ # Finally write the data to a temporary file
+ input_filename = os.path.join(temp_folder, input_name)
+ # remove a stale copy left by a previous upload of the same name
+ if os.path.exists(input_filename):
+ os.remove(input_filename)
- # import pdb;pdb.set_trace()
- # --- controle presence tag
- if data.find('') == -1:
- message = "Le format du fichier n'est pas valide (absence de tag ou ). Téléchargement refusé."
- add_error2log(logfile_name, input_name, message)
- return message
+ # copy the uploaded stream into the temporary file
+ input_file.seek(0)
+ with open(input_filename, 'wb') as output_file:
+ shutil.copyfileobj(input_file, output_file)
- input_file.seek(0) # back to begining position
- # controle OK, traiter le fichier
+ # detect text coding and transform to utf-8 if needed
+ with open(input_filename, 'rb') as f:
+ raw_bytes = f.read()
+ detected = chardet.detect(raw_bytes)
+ input_encoding = (detected['encoding'] or 'utf-8').lower()
+ if input_encoding != 'utf-8':
+ with open(input_filename, 'r', encoding=input_encoding) as f:
+ input_buffer = f.read()
+ # --- check if input_file contains tag
+ if input_buffer.find('') == -1:
+ message = "Le format du fichier n'est pas valide (absence de tag ou ). Téléchargement refusé."
+ add_error2log(logfile_name, input_name, message)
+ return message, input_encoding
+
+ # --- convert input_file encoding to utf-8
+ with open(input_filename, 'w', encoding='utf-8') as f:
+ message = "Le fichier est converti de " + input_encoding + " en utf-8."
+ add_error2log(logfile_name, input_name, message)
+ f.write(input_buffer)
+

+ # controls OK, cleanup input_file
output_name = "clean_" + input_name
output_file = os.path.join(temp_folder, output_name)
- message = clean_html(input_file, output_file)
+ message = clean_html(input_filename, output_file)
if message:
add_error2log(logfile_name, input_name, message)
else:
add_error2log(logfile_name, input_name, "*** Cleanup réussi. ***")
- return message
+ return message, input_encoding
def clean_html(input_file, output_file):
# cleanup undesirable tags in html file
@@ -92,15 +117,15 @@ def clean_html(input_file, output_file):
message = ""
try:
- with open(output_file, 'w') as fo:
+ with open(input_file, 'r') as fi, open(output_file, 'w') as fo:
fo.write("\n\n\nuntitled\n\n".format(encoding))
body = False
skip_tag = False
nb_lines = 0
- for line_in_bytes in input_file:
- line_in = line_in_bytes.decode('utf-8')
+ for line_in in fi:
+ line_in = html.unescape(line_in)
line_out = ""
if not body:
diff --git a/setup.py b/setup.py
index a4b939c..60a2c27 100644
--- a/setup.py
+++ b/setup.py
@@ -14,6 +14,7 @@ requires = [
'pyramid_jinja2',
'pyramid_debugtoolbar',
'python-magic',
+
'waitress',
]