168 lines
6.8 KiB
Python
168 lines
6.8 KiB
Python
import datetime
import os
import re
import shutil
import sys

import magic
from pyramid.httpexceptions import HTTPFound
from pyramid.view import view_config
|
|
|
|
@view_config(route_name='home', renderer='html_cleanup:templates/home.jinja2')
def home(request):
    """Render the home page and handle an optional file upload.

    When the form was submitted with a file in the ``uploadfile`` field, the
    file is handed to ``process_file``. On success (empty message) the name
    and static URL of the cleaned copy are returned for the template.

    Returns:
        dict: ``page_title``, ``message`` (empty on success or when nothing
        was uploaded), ``file_url`` and ``file_name`` (both empty unless a
        cleaned file was produced).
    """
    message = ''
    file_name = ''
    file_url = ''

    if 'form.submitted' in request.params:
        # .get() avoids a KeyError when the form is posted without the field.
        upload = request.POST.get('uploadfile')

        # A submission without a chosen file does not yield an upload object
        # (the field was previously compared against b''), so only proceed
        # when the value really carries the upload attributes.
        if hasattr(upload, 'file') and hasattr(upload, 'filename'):
            # The file name is client-controlled: keep only its last path
            # component (either separator style) so it cannot point outside
            # the temp folder or break the download URL.
            raw_name = upload.filename or ''
            input_name = os.path.basename(raw_name.replace('\\', '/'))

            if input_name:
                # Run the checks on the upload and clean it.
                message = process_file(request, upload.file, input_name)
                if message == '':
                    file_name = "clean_" + input_name
                    file_url = request.static_url('html_cleanup:static/temp/') + file_name

    return {
        'page_title': "HTML cleanup",
        'message': message,
        'file_url': file_url,
        'file_name': file_name,
    }
|
|
|
|
# Matches the opening <body> tag anywhere in a line, in any letter case.
_BODY_OPEN = re.compile(r"<body\b", re.IGNORECASE)

# Tags written back together with all of their attributes.
_TAGS_KEPT_WITH_ATTRIBUTES = frozenset({"table", "div", "img", "a"})

# Tags written back with every attribute removed.
_TAGS_KEPT_BARE = frozenset({
    "b", "/b", "em", "/em", "i", "/i", "li", "li/", "/li",
    "ol", "/ol", "ul", "/ul", "strong", "/strong", "sub", "/sub",
    "sup", "/sup", "u", "/u",
    "h1", "/h1", "h2", "/h2", "h3", "/h3", "h4", "/h4", "h5",
    "/h5", "h6", "/h6", "/p", "body", "/body", "/html", "/div",
    "/img", "/a",
    "/table", "td", "/td", "th", "/th", "tr", "/tr",
    "br", "br/",
})

_HTML_HEADER = (
    "<html>\n<head>\n<meta http-equiv=\"Content-Type\" "
    "content=\"text/html; charset={}\">\n<title>untitled</title>\n</head>\n"
)


def _rewrite_tag(inner):
    """Return the cleaned form of one tag, or "" when the tag is dropped.

    ``inner`` is the text between ``<`` and ``>``. Text that is empty or
    starts with whitespace (e.g. the ``<`` of "a < b") is not treated as a
    tag and is dropped.
    """
    fields = inner.split(None, 1)
    if not fields or inner[0].isspace():
        return ""

    name = fields[0].lower()
    if name in _TAGS_KEPT_WITH_ATTRIBUTES:
        return "<{}>".format(inner)
    if name == "p":
        return "<p align=\"justify\">"
    if name in _TAGS_KEPT_BARE:
        return "<{}>".format(name)
    return ""


def _filter_line(text, pending):
    """Filter the tags of one line; return ``(cleaned_text, pending)``.

    ``pending`` is ``None`` when the previous line ended outside a tag,
    otherwise the tag text collected so far for a tag whose ``>`` has not
    been seen yet. The returned ``pending`` has the same meaning for the
    next line.
    """
    pieces = []
    pos = 0

    if pending is not None:
        close = text.find(">")
        if close == -1:
            # Still inside the tag: keep collecting, emit nothing.
            return "", pending + " " + text.strip()
        pieces.append(_rewrite_tag((pending + " " + text[:close].strip()).rstrip()))
        pos = close + 1

    while True:
        tag_open = text.find("<", pos)
        if tag_open == -1:
            pieces.append(text[pos:])
            return "".join(pieces), None

        pieces.append(text[pos:tag_open])
        tag_close = text.find(">", tag_open + 1)
        if tag_close == -1:
            # Tag continues on a following line.
            return "".join(pieces), text[tag_open + 1:]

        pieces.append(_rewrite_tag(text[tag_open + 1:tag_close]))
        pos = tag_close + 1


def clean_html(input_file, output_file):
    """Write a copy of an HTML document with undesirable tags removed.

    Everything before the opening ``<body`` tag is discarded and replaced by
    a fixed header. From there on, text is copied through and each tag is
    either kept with its attributes (table, div, img, a), kept without
    attributes, replaced (``<p ...>`` becomes ``<p align="justify">``) or
    dropped. A tag may span several input lines. Empty output lines are not
    written.

    NOTE(review): attributes of table/div/img/a are copied verbatim, so
    event-handler attributes or ``javascript:`` URLs in the input survive;
    confirm this is acceptable for wherever the output is served.

    Args:
        input_file: iterable yielding the document's lines as UTF-8 bytes.
        output_file: path of the file to create (overwritten if present).

    Returns:
        str: "" on success, otherwise "Error: <exception text>". Exceptions
        are reported through the return value, never raised; a partially
        written output file may be left behind on failure.
    """
    encoding = "utf-8"
    message = ""

    try:
        # The header declares utf-8, so the file must be written as utf-8
        # regardless of the platform default encoding.
        with open(output_file, 'w', encoding=encoding) as fo:
            fo.write(_HTML_HEADER.format(encoding))

            in_body = False
            pending = None

            for line_in_bytes in input_file:
                # Drop the line terminator; one "\n" is added back on write.
                line_in = line_in_bytes.decode(encoding).rstrip("\r\n")

                if not in_body:
                    found = _BODY_OPEN.search(line_in)
                    if found is None:
                        continue
                    in_body = True
                    # Ignore whatever precedes <body on this line.
                    line_in = line_in[found.start():]

                line_out, pending = _filter_line(line_in, pending)
                if line_out != "":
                    fo.write(line_out + "\n")

    except Exception as e:
        # Callers expect failures as a message, not as an exception.
        message = "Error: {}".format(str(e))

    return message
|
|
|
|
def process_file(request, input_file, input_name):
    """Check an uploaded file and, when accepted, write its cleaned copy.

    The checks run in this order: file name, MIME type, size. A rejected
    file, or a failure reported by ``clean_html``, is appended to the
    ``errors_log`` file in the configured temp folder.

    NOTE(review): the allow-list accepts PNG/JPEG, yet accepted files are
    always passed to ``clean_html``, which decodes them as UTF-8 text;
    confirm whether images are meant to be accepted here.

    Args:
        request: current request; ``request.registry.settings['temp_folder']``
            is the folder receiving the cleaned file and the log.
        input_file: seekable binary file object holding the upload.
        input_name: file name supplied by the client.

    Returns:
        str: "" when the cleaned file "clean_<input_name>" was written,
        otherwise the error message to show to the user.
    """
    allowed_mimes = ('text/html', 'image/png', 'image/jpeg')
    max_size = 5 * (1024 ** 2)  # 5 MiB
    sniff_bytes = 2048  # header size handed to libmagic
    temp_folder = request.registry.settings['temp_folder']
    logfile_name = os.path.join(temp_folder, 'errors_log')

    # The name is joined into a filesystem path below: refuse anything that
    # carries a directory component.
    if os.path.basename(input_name) != input_name:
        message = "Le nom du fichier n'est pas valide. Téléchargement refusé."
        add_error2log(logfile_name, input_name, message)
        return message

    # Is the file type allowed? Only the head of the file is read, so an
    # oversized upload is never loaded into memory as a whole.
    input_file.seek(0)
    mime = magic.from_buffer(input_file.read(sniff_bytes), mime=True)
    if mime not in allowed_mimes:
        message = "Le format du fichier n'est pas valide. Téléchargement refusé."
        add_error2log(logfile_name, input_name, message)
        return message

    # Measure the file by seeking to its end, then rewind for processing.
    input_file.seek(0, 2)
    filesize = input_file.tell()
    input_file.seek(0)
    if filesize > max_size:
        message = "La taille du fichier dépasse la limite autorisée. Téléchargement refusé."
        add_error2log(logfile_name, input_name, message)
        return message

    # All checks passed: clean the file.
    output_file = os.path.join(temp_folder, "clean_" + input_name)
    message = clean_html(input_file, output_file)
    if message:
        add_error2log(logfile_name, input_name, message)

    return message
|
|
|
|
def add_error2log(logfile_name, input_name, message):
    """Append one timestamped error record to the log file.

    The record has the form ``YYYY-MM-DD HH:MM:SS [input_name] message`` and
    is written as UTF-8. Line breaks inside ``input_name`` or ``message``
    are replaced by single spaces so that one call always produces exactly
    one log line (the name can come from an upload and must not be able to
    forge extra records).

    Args:
        logfile_name: path of the log file; created if it does not exist.
        input_name (str): name of the file the error relates to.
        message: error text to record.
    """
    # Local time, without timezone information.
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    safe_name = " ".join(input_name.splitlines())
    safe_message = " ".join(f'{message}'.splitlines())

    with open(logfile_name, 'a', encoding='utf-8') as file:
        file.write(f'{timestamp} [{safe_name}] {safe_message}\n')
|