renforcer les filtres sur les tags

This commit is contained in:
Phuoc Cao
2026-01-28 11:37:04 +01:00
parent f92b700dff
commit 92352450f6

View File

@@ -33,7 +33,7 @@ def home(request):
message = "Votre fichier est converti et nettoyé : " + output_name message = "Votre fichier est converti et nettoyé : " + output_name
return { return {
'page_title': "HTML cleanup 1.2", 'page_title': "HTML cleanup 26.01",
'message': message, 'message': message,
'file_url': file_url, 'file_url': file_url,
'file_name': output_name, 'file_name': output_name,
@@ -119,13 +119,37 @@ def process_file(request, input_file, input_name):
return message, input_encoding, output_name return message, input_encoding, output_name
def clean_html(input_file, output_file): def clean_html(input_file, output_file):
#
# cleanup undesirable tags in html file # cleanup undesirable tags in html file
#
#20241118: thn: changed chardet to charset_normalizer
#20251216: thn: changed align:"justify" to style="text-align:justify;" in <p>,
#because it was deprecated in HTML 5
#accepted id in <span, p, h1, ... h6>
#20251217: thn, replaced re.match by re.search
#20251223: thn, accepted hyphen in id. Avoid: [.:] because they are used in CSS
#20251224: check input syntax for id
#Only support <tag ....> in one line
#20260104: treated "div" the same as "span", "h1"..."h6"
#20260104: removed "span", </span>
#20260122: move the function check_input_name into the function clean_html as inner function
#version: 3_5_3_2 (26.01)
def check_input_name(attribute):
    """Return True when the quoted value in *attribute* starts with a letter.

    *attribute* is expected to look like ``name="value"``. The text between
    the double quotes is extracted and its first character is tested with
    ``str.isalpha``.

    Returns False when no ``="..."`` pair is found, when the quoted value is
    empty, or when its first character is not alphabetic.
    """
    # Pattern kept exactly as before: capture whatever sits between the quotes.
    name_regex = r'.*=\s*\"(.*)\".*'
    matched = re.search(name_regex, attribute)
    if not matched:
        return False
    value = matched.group(1)
    # (.*) can capture an empty string (e.g. id=""); indexing value[0] on it
    # used to raise IndexError, so an empty value is rejected explicitly.
    if not value:
        return False
    return value[0].isalpha()
encoding = "utf-8" encoding = "utf-8"
message = "" message = ""
try: try:
with open(input_file, 'r') as fi, open(output_file, 'w') as fo: with open(input_file, 'r') as fi, open(output_file, 'w') as fo:
fo.write("<html>\n<head>\n<meta http-equiv=\"Content-Type\" content=\"text/html; charset={}\">\n<title>untitled</title>\n</head>\n".format(encoding))
body = False body = False
skip_tag = False skip_tag = False
@@ -136,9 +160,11 @@ def clean_html(input_file, output_file):
line_out = "" line_out = ""
if not body: if not body:
init = line_in.find("<body") init = re.search(r'^\s*<body.*',line_in)
if init != -1 and init == 0: if init:
body = True body = True
fo.write("<html>\n\t<head>\n\t\t<meta http-equiv=\"Content-Type\" content=\"text/html; charset={}\">\n\t\t<title>untitled</title>\n\t</head>\n".format(encoding))
init=0
if body: if body:
nb_lines += 1 nb_lines += 1
@@ -196,32 +222,31 @@ def clean_html(input_file, output_file):
elif tag[0] in ["span", "p", "h1", "h2", "h3", "h4", "h5", "h6"]: elif tag[0] in ["span", "p", "h1", "h2", "h3", "h4", "h5", "h6"]:
line_out += "<{}".format(tag[0]) line_out += "<{}".format(tag[0])
if tag[0] == "p": # 26.01 : ne plus ajouter text-align dans <p>
# ajouter text-align dans p # if tag[0] == "p":
line_out += ' style="text-align:justify;"' # line_out += ' style="text-align:justify;"'
if len(tag) > 1: if len(tag) > 1 and not skip_tag:
matched = re.search(id_match_regex, line_in) matched = re.search(id_match_regex, line_in)
if matched: if matched:
line_out += ' '+ matched.group(1) if check_input_name(matched.group(1)):
line_out += ' '+ matched.group(1)
line_out += '>' line_out += '>'
elif tag[0] in ["b", "/b", "em", "/em", "i", "/i", "li", "li/", "/li", "ol", "/ol", elif tag[0] in ["b", "/b", "em", "/em", "i", "/i", "li", "/li", "ol", "/ol",
"ul", "/ul", "strong", "/strong", "sub", "/sub", "sup", "/sup", "u", "/u"]: "ul", "/ul", "strong", "/strong", "sub", "/sub", "sup", "/sup",
"u", "/u"]:
line_out += "<{}>".format(tag[0]) line_out += "<{}>".format(tag[0])
elif tag[0] in ["/h1", "/h2", "/h3", "/h4", "/h5", "/h6", "/p",
elif tag[0] in ["/span", "/h1", "/h2", "/h3", "/h4", "/h5", "/h6", "/p", "body", "/body", "body", "/body", "/html", "/div", "/img", "/a"]:
"/html", "/div", "/img", "/a"]:
line_out += "<{}>".format(tag[0]) line_out += "<{}>".format(tag[0])
elif tag[0] in ["table", "/table", "td", "/td", "th", "/th", "tr", "/tr"]: elif tag[0] in ["table", "/table", "td", "/td", "th", "/th", "tr", "/tr"]:
line_out += "<{}>".format(tag[0]) line_out += "<{}>".format(tag[0])
elif tag[0] in ["br", "br/", "hr"]:
elif tag[0] in ["br", "br/"]:
line_out += "<{}>".format(tag[0]) line_out += "<{}>".format(tag[0])
if line_out != "": if line_out != "":
fo.write(line_out + "\n") fo.write(line_out + "\n")
except Exception as e: except Exception as e:
message = "Error: {}".format(str(e)) message = "Error: {}".format(str(e))