Added conversion of the input file to UTF-8 when needed
This commit is contained in:
@@ -31,7 +31,7 @@
|
||||
|
||||
{% if file_url %}
|
||||
<h2 class="text-info font-semi-bold">Cleanup réussi</h2>
|
||||
Votre fichier nettoyé : <b>{{ file_name }}</b> <br><br>
|
||||
{{ libelle }} <b>{{ file_name }}</b> <br><br>
|
||||
<a class="btn btn-primary" href="{{ file_url }}" download="{{ file_name }}"><span class="glyphicon glyphicon-arrow-down"></span> Télécharger</a>
|
||||
{% endif %}
|
||||
|
||||
|
||||
@@ -7,6 +7,7 @@ import sys
|
||||
import datetime
|
||||
import time
|
||||
import chardet
|
||||
import html
|
||||
|
||||
@view_config(route_name='home', renderer='html_cleanup:templates/home.jinja2')
def home(request):
    """Render the home page and, on form submission, process the upload.

    When the request carries ``form.submitted`` and a non-empty
    ``uploadfile`` field, the upload is passed to ``process_file``.  On
    success (empty message) the download name and URL of the cleaned file
    are computed, together with a label telling the user whether the file
    was only cleaned or also converted to UTF-8.

    Args:
        request: The current request.

    Returns:
        dict: Template values ``page_title``, ``message``, ``file_url``,
        ``file_name`` and ``libelle``.  The last three are empty strings
        unless a file was processed successfully.
    """
    message = ''
    file_name = ''
    file_url = ''
    input_encoding = ''
    libelle = ''

    if 'form.submitted' in request.params:
        # An empty file input is compared against b'' as in the original
        # code; .get() additionally tolerates the field being absent.
        upload = request.POST.get('uploadfile', b'')
        if upload != b'':
            input_file = upload.file
            input_name = upload.filename
            # Validate and process the uploaded file (exactly once).
            message, input_encoding = process_file(request, input_file, input_name)
            if message == '':
                file_name = "clean_" + input_name
                file_url = request.static_url('html_cleanup:static/temp/') + file_name
                if input_encoding == 'utf-8':
                    libelle = "Votre fichier est nettoyé : "
                else:
                    libelle = "Votre fichier est converti et nettoyé : "

    return {
        'page_title': "HTML cleanup",
        'message': message,
        'file_url': file_url,
        'file_name': file_name,
        'libelle': libelle,
    }
|
||||
|
||||
def process_file(request, input_file, input_name):
    """Validate an uploaded HTML file, store it as UTF-8 and clean it.

    Checks are applied in this order: MIME type, size, decodable text
    encoding, presence of ``<body`` and ``</body>``.  When all of them
    pass, a UTF-8 copy of the upload is written to the temp folder and
    passed to ``clean_html`` together with the output path
    ``clean_<name>``.  Each rejection, conversion and final outcome is
    recorded through ``add_error2log`` in the ``errors_log`` file of the
    temp folder.

    Args:
        request: Request whose ``registry.settings['temp_folder']`` names
            the working directory.
        input_file: Binary file object holding the upload; assumed to be
            positioned at its start, as the caller in this file passes it.
        input_name: File name supplied by the client.  Only its base name
            is used to build paths.

    Returns:
        tuple: ``(message, input_encoding)`` on every path.  ``message`` is
        ``''`` on success, otherwise the user-facing error text.
        ``input_encoding`` is the lower-cased encoding name reported by
        chardet, or ``''`` when the file was rejected before detection or
        no encoding could be detected.
    """
    message = ''
    input_encoding = ''
    EXT_ALLOWED = ['text/html']
    MAX_SIZE = 10 * (1024 ** 2)  # 10 MiB
    temp_folder = request.registry.settings['temp_folder']
    logfile_name = os.path.join(temp_folder, 'errors_log')

    # Read the upload once; the same bytes feed every check below.
    raw_bytes = input_file.read()

    # --- check the mime type
    mime = magic.from_buffer(raw_bytes, mime=True)
    if mime not in EXT_ALLOWED:
        message = "Le format du fichier n'est pas valide. Téléchargement refusé."
        add_error2log(logfile_name, input_name, message)
        return message, input_encoding

    # --- check the size
    if len(raw_bytes) > MAX_SIZE:
        message = "La taille du fichier dépasse la limite autorisée. Téléchargement refusé."
        add_error2log(logfile_name, input_name, message)
        return message, input_encoding

    # --- detect the text encoding and decode
    detected = chardet.detect(raw_bytes)
    # chardet reports None when it cannot guess; normalise that to ''.
    input_encoding = (detected['encoding'] or '').lower()
    input_buffer = None
    if input_encoding:
        try:
            input_buffer = raw_bytes.decode(input_encoding)
        except (LookupError, UnicodeDecodeError):
            # Wrong guess, or a codec name Python does not know.
            input_buffer = None
    if input_buffer is None:
        message = "L'encodage du fichier n'a pas pu être déterminé. Téléchargement refusé."
        add_error2log(logfile_name, input_name, message)
        return message, input_encoding

    # --- check that the document contains the <body> tags
    if '<body' not in input_buffer or '</body>' not in input_buffer:
        message = "Le format du fichier n'est pas valide (absence de tag <body> ou </body>). Téléchargement refusé."
        add_error2log(logfile_name, input_name, message)
        return message, input_encoding

    # --- store a UTF-8 copy in the temp folder
    # The client controls input_name: keep only the last path component so
    # the copy cannot be written outside temp_folder.
    safe_name = os.path.basename(input_name.replace('\\', '/'))
    input_filename = os.path.join(temp_folder, safe_name)
    # Remove any previous upload stored under the same name.
    if os.path.exists(input_filename):
        os.remove(input_filename)

    if input_encoding == 'utf-8':
        # Already UTF-8: store the bytes untouched.
        with open(input_filename, 'wb') as temp_file:
            temp_file.write(raw_bytes)
    else:
        # newline='' keeps the document's own line endings.
        with open(input_filename, 'w', encoding='utf-8', newline='') as temp_file:
            temp_file.write(input_buffer)
        add_error2log(
            logfile_name,
            input_name,
            "Le fichier est converti de " + input_encoding + " en utf-8.",
        )

    # --- all checks passed: clean the stored copy
    output_name = "clean_" + safe_name
    output_file = os.path.join(temp_folder, output_name)
    message = clean_html(input_filename, output_file)
    if message:
        add_error2log(logfile_name, input_name, message)
    else:
        add_error2log(logfile_name, input_name, "*** Cleanup réussi. ***")

    return message, input_encoding
|
||||
|
||||
def clean_html(input_file, output_file):
|
||||
# cleanup undesirable tags in html file
|
||||
@@ -92,15 +117,15 @@ def clean_html(input_file, output_file):
|
||||
message = ""
|
||||
|
||||
try:
|
||||
with open(output_file, 'w') as fo:
|
||||
with open(input_file, 'r') as fi, open(output_file, 'w') as fo:
|
||||
fo.write("<html>\n<head>\n<meta http-equiv=\"Content-Type\" content=\"text/html; charset={}\">\n<title>untitled</title>\n</head>\n".format(encoding))
|
||||
|
||||
body = False
|
||||
skip_tag = False
|
||||
nb_lines = 0
|
||||
|
||||
for line_in_bytes in input_file:
|
||||
line_in = line_in_bytes.decode('utf-8')
|
||||
for line_in in fi:
|
||||
line_in = html.unescape(line_in)
|
||||
line_out = ""
|
||||
|
||||
if not body:
|
||||
|
||||
Reference in New Issue
Block a user