Add conversion of the input file to UTF-8 if needed
This commit is contained in:
@@ -31,7 +31,7 @@
|
|||||||
|
|
||||||
{% if file_url %}
|
{% if file_url %}
|
||||||
<h2 class="text-info font-semi-bold">Cleanup réussi</h2>
|
<h2 class="text-info font-semi-bold">Cleanup réussi</h2>
|
||||||
Votre fichier nettoyé : <b>{{ file_name }}</b> <br><br>
|
{{ libelle }} <b>{{ file_name }}</b> <br><br>
|
||||||
<a class="btn btn-primary" href="{{ file_url }}" download="{{ file_name }}"><span class="glyphicon glyphicon-arrow-down"></span> Télécharger</a>
|
<a class="btn btn-primary" href="{{ file_url }}" download="{{ file_name }}"><span class="glyphicon glyphicon-arrow-down"></span> Télécharger</a>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ import sys
|
|||||||
import datetime
|
import datetime
|
||||||
import time
|
import time
|
||||||
import chardet
|
import chardet
|
||||||
|
import html
|
||||||
|
|
||||||
@view_config(route_name='home', renderer='html_cleanup:templates/home.jinja2')
|
@view_config(route_name='home', renderer='html_cleanup:templates/home.jinja2')
|
||||||
def home(request):
|
def home(request):
|
||||||
@@ -14,77 +15,101 @@ def home(request):
|
|||||||
message = ''
|
message = ''
|
||||||
file_name = ''
|
file_name = ''
|
||||||
file_url = ''
|
file_url = ''
|
||||||
|
input_encoding = ''
|
||||||
|
libelle = ''
|
||||||
|
|
||||||
if 'form.submitted' in request.params:
|
if 'form.submitted' in request.params:
|
||||||
if request.POST['uploadfile'] != b'':
|
if request.POST['uploadfile'] != b'':
|
||||||
input_file = request.POST['uploadfile'].file
|
input_file = request.POST['uploadfile'].file
|
||||||
input_name = request.POST['uploadfile'].filename
|
input_name = request.POST['uploadfile'].filename
|
||||||
# récupère le fichier download, faire les controles et traiter
|
# récupère le fichier download, faire les controles et traiter
|
||||||
message = process_file(request, input_file, input_name)
|
message, input_encoding = process_file(request, input_file, input_name)
|
||||||
if message == '':
|
if message == '':
|
||||||
file_name = "clean_" + input_name
|
file_name = "clean_" + input_name
|
||||||
file_url = request.static_url('html_cleanup:static/temp/') + file_name
|
file_url = request.static_url('html_cleanup:static/temp/') + file_name
|
||||||
|
if input_encoding == 'utf-8':
|
||||||
|
libelle = "Votre fichier est nettoyé : "
|
||||||
|
else:
|
||||||
|
libelle = "Votre fichier est converti et nettoyé : "
|
||||||
|
|
||||||
return {
|
return {
|
||||||
'page_title': "HTML cleanup",
|
'page_title': "HTML cleanup",
|
||||||
'message': message,
|
'message': message,
|
||||||
'file_url': file_url,
|
'file_url': file_url,
|
||||||
'file_name': file_name,
|
'file_name': file_name,
|
||||||
|
'libelle': libelle,
|
||||||
}
|
}
|
||||||
|
|
||||||
def process_file(request, input_file, input_name):
    """Validate an uploaded HTML file, convert it to UTF-8 if needed, clean it.

    The upload is checked (file name, MIME type, size, decodable text,
    presence of a <body> element), copied into the configured temp folder,
    rewritten as UTF-8 when chardet reports another encoding, and finally
    handed to clean_html(). Every refusal and the final outcome are appended
    to the 'errors_log' file of the temp folder through add_error2log().

    Args:
        request: current request; only
            request.registry.settings['temp_folder'] is read from it.
        input_file: seekable binary file object holding the uploaded data.
        input_name: file name supplied by the client (untrusted).

    Returns:
        A (message, input_encoding) tuple on every path, so callers can
        always unpack two values. message is the refusal text when a check
        fails, otherwise whatever clean_html() returns (the caller treats
        '' as success). input_encoding is the lower-cased encoding name
        reported by chardet, or '' when the file is refused before the
        encoding has been detected.
    """
    EXT_ALLOWED = ['text/html']
    MAX_SIZE = 10 * (1024 ** 2)  # 10 Mb
    temp_folder = request.registry.settings['temp_folder']
    logfile_name = os.path.join(temp_folder, 'errors_log')

    def reject(message, encoding=''):
        # Log the refusal and return the same 2-tuple shape as the success path.
        add_error2log(logfile_name, input_name, message)
        return message, encoding

    # --- check the file name: it comes from the client and is joined to
    # temp_folder below, so refuse anything that is not a bare file name
    # (path traversal) or that would overwrite the log file itself
    if (not input_name
            or os.path.basename(input_name) != input_name
            or input_name in ('.', '..', 'errors_log')):
        return reject("Le nom du fichier n'est pas valide. Téléchargement refusé.")

    # --- check the mime type
    mime = magic.from_buffer(input_file.read(), mime=True)
    if mime not in EXT_ALLOWED:
        return reject("Le format du fichier n'est pas valide. Téléchargement refusé.")

    # --- check input_file size
    input_file.seek(0, os.SEEK_END)
    filesize = input_file.tell()
    input_file.seek(0)  # back to beginning position
    if filesize > MAX_SIZE:
        return reject("La taille du fichier dépasse la limite autorisée. Téléchargement refusé.")

    # --- copy the upload into the temp folder, replacing any previous copy
    input_filename = os.path.join(temp_folder, input_name)
    if os.path.exists(input_filename):
        os.remove(input_filename)
    with open(input_filename, 'wb') as temp_copy:
        shutil.copyfileobj(input_file, temp_copy)

    # --- detect the text encoding
    with open(input_filename, 'rb') as f:
        raw_bytes = f.read()
    detected_name = chardet.detect(raw_bytes)['encoding']
    if detected_name is None:
        # chardet could not make a guess
        return reject("L'encodage du fichier n'a pas pu être déterminé. Téléchargement refusé.")
    input_encoding = detected_name.lower()

    # The guess may be wrong or name a codec Python does not know:
    # treat both as an undetermined encoding instead of crashing.
    try:
        with open(input_filename, 'r', encoding=input_encoding) as f:
            input_buffer = f.read()
    except (LookupError, UnicodeDecodeError):
        return reject(
            "L'encodage du fichier n'a pas pu être déterminé. Téléchargement refusé.",
            input_encoding,
        )

    # --- check that input_file contains a <body> element, whatever its encoding
    if '<body' not in input_buffer or '</body>' not in input_buffer:
        return reject(
            "Le format du fichier n'est pas valide (absence de tag <body> ou </body>). Téléchargement refusé.",
            input_encoding,
        )

    # --- convert input_file encoding to utf-8 if needed
    if input_encoding != 'utf-8':
        with open(input_filename, 'w', encoding='utf-8') as f:
            add_error2log(
                logfile_name,
                input_name,
                "Le fichier est converti de " + input_encoding + " en utf-8.",
            )
            f.write(input_buffer)

    # --- controls OK, cleanup input_file
    output_filename = os.path.join(temp_folder, "clean_" + input_name)
    message = clean_html(input_filename, output_filename)
    add_error2log(logfile_name, input_name, message or "*** Cleanup réussi. ***")

    return message, input_encoding
|
||||||
|
|
||||||
def clean_html(input_file, output_file):
|
def clean_html(input_file, output_file):
|
||||||
# cleanup undesirable tags in html file
|
# cleanup undesirable tags in html file
|
||||||
@@ -92,15 +117,15 @@ def clean_html(input_file, output_file):
|
|||||||
message = ""
|
message = ""
|
||||||
|
|
||||||
try:
|
try:
|
||||||
with open(output_file, 'w') as fo:
|
with open(input_file, 'r') as fi, open(output_file, 'w') as fo:
|
||||||
fo.write("<html>\n<head>\n<meta http-equiv=\"Content-Type\" content=\"text/html; charset={}\">\n<title>untitled</title>\n</head>\n".format(encoding))
|
fo.write("<html>\n<head>\n<meta http-equiv=\"Content-Type\" content=\"text/html; charset={}\">\n<title>untitled</title>\n</head>\n".format(encoding))
|
||||||
|
|
||||||
body = False
|
body = False
|
||||||
skip_tag = False
|
skip_tag = False
|
||||||
nb_lines = 0
|
nb_lines = 0
|
||||||
|
|
||||||
for line_in_bytes in input_file:
|
for line_in in fi:
|
||||||
line_in = line_in_bytes.decode('utf-8')
|
line_in = html.unescape(line_in)
|
||||||
line_out = ""
|
line_out = ""
|
||||||
|
|
||||||
if not body:
|
if not body:
|
||||||
|
|||||||
Reference in New Issue
Block a user