renforcer les filtres sur les tags
This commit is contained in:
@@ -33,7 +33,7 @@ def home(request):
|
||||
message = "Votre fichier est converti et nettoyé : " + output_name
|
||||
|
||||
return {
|
||||
'page_title': "HTML cleanup 1.2",
|
||||
'page_title': "HTML cleanup 26.01",
|
||||
'message': message,
|
||||
'file_url': file_url,
|
||||
'file_name': output_name,
|
||||
@@ -119,13 +119,37 @@ def process_file(request, input_file, input_name):
|
||||
return message, input_encoding, output_name
|
||||
|
||||
def clean_html(input_file, output_file):
|
||||
#
|
||||
# cleanup undesirable tags in html file
|
||||
#
|
||||
#20241118: thn: changed chardet to charset_normalizer
|
||||
#20251216: thn: changed align:"justify" to style="text-align:justify;" in <p>,
|
||||
#because it was deprecated in HTML 5
|
||||
#accepted id in <span, p, h1, ... h6>
|
||||
#20251217: thn, replaced re.match by re.search
|
||||
#20251223: thn, accepted hyphen in id. Avoid: [.:] because they are used in CSS
|
||||
#20251224: check input syntax for id
|
||||
#Only support <tag ....> in one line
|
||||
#20260104: treated "div" the same as "span", "h1"..."h6"
|
||||
#20260104: removed "span", </span>
|
||||
#20260122: move the function check_input_name into the function clean_html as inner function
|
||||
#version: 3_5_3_2 (26.01)
|
||||
|
||||
def check_input_name(attribute):
    """Tell whether the quoted value of an attribute starts with a letter.

    The attribute text is expected to look like ``name="value"``. The
    value captured between the double quotes is accepted only when its
    first character is alphabetic.

    Args:
        attribute: Attribute text to validate, e.g. ``'id="chapter-1"'``.

    Returns:
        True when a double-quoted value is found and its first character
        is a letter; False otherwise (no quoted value, empty value, or a
        value starting with a digit, hyphen, or other non-letter).
    """
    name_regex = r'.*=\s*\"(.*)\".*'
    matched = re.search(name_regex, attribute)
    if not matched:
        return False
    value = matched.group(1)
    # The capture group may be empty (e.g. id=""); slicing instead of
    # indexing avoids an IndexError and simply rejects the empty value.
    return value[:1].isalpha()
|
||||
|
||||
|
||||
encoding = "utf-8"
|
||||
message = ""
|
||||
|
||||
try:
|
||||
with open(input_file, 'r') as fi, open(output_file, 'w') as fo:
|
||||
fo.write("<html>\n<head>\n<meta http-equiv=\"Content-Type\" content=\"text/html; charset={}\">\n<title>untitled</title>\n</head>\n".format(encoding))
|
||||
|
||||
body = False
|
||||
skip_tag = False
|
||||
@@ -136,9 +160,11 @@ def clean_html(input_file, output_file):
|
||||
line_out = ""
|
||||
|
||||
if not body:
|
||||
init = line_in.find("<body")
|
||||
if init != -1 and init == 0:
|
||||
init = re.search(r'^\s*<body.*',line_in)
|
||||
if init:
|
||||
body = True
|
||||
fo.write("<html>\n\t<head>\n\t\t<meta http-equiv=\"Content-Type\" content=\"text/html; charset={}\">\n\t\t<title>untitled</title>\n\t</head>\n".format(encoding))
|
||||
init=0
|
||||
|
||||
if body:
|
||||
nb_lines += 1
|
||||
@@ -196,27 +222,26 @@ def clean_html(input_file, output_file):
|
||||
|
||||
elif tag[0] in ["span", "p", "h1", "h2", "h3", "h4", "h5", "h6"]:
|
||||
line_out += "<{}".format(tag[0])
|
||||
if tag[0] == "p":
|
||||
# ajouter text-align dans p
|
||||
line_out += ' style="text-align:justify;"'
|
||||
if len(tag) > 1:
|
||||
# 26.01 : ne plus ajouter text-align dans <p>
|
||||
# if tag[0] == "p":
|
||||
# line_out += ' style="text-align:justify;"'
|
||||
if len(tag) > 1 and not skip_tag:
|
||||
matched = re.search(id_match_regex, line_in)
|
||||
if matched:
|
||||
line_out += ' '+ matched.group(1)
|
||||
if check_input_name(matched.group(1)):
|
||||
line_out += ' '+ matched.group(1)
|
||||
line_out += '>'
|
||||
|
||||
elif tag[0] in ["b", "/b", "em", "/em", "i", "/i", "li", "li/", "/li", "ol", "/ol",
|
||||
"ul", "/ul", "strong", "/strong", "sub", "/sub", "sup", "/sup", "u", "/u"]:
|
||||
elif tag[0] in ["b", "/b", "em", "/em", "i", "/i", "li", "/li", "ol", "/ol",
|
||||
"ul", "/ul", "strong", "/strong", "sub", "/sub", "sup", "/sup",
|
||||
"u", "/u"]:
|
||||
line_out += "<{}>".format(tag[0])
|
||||
|
||||
elif tag[0] in ["/span", "/h1", "/h2", "/h3", "/h4", "/h5", "/h6", "/p", "body", "/body",
|
||||
"/html", "/div", "/img", "/a"]:
|
||||
elif tag[0] in ["/h1", "/h2", "/h3", "/h4", "/h5", "/h6", "/p",
|
||||
"body", "/body", "/html", "/div", "/img", "/a"]:
|
||||
line_out += "<{}>".format(tag[0])
|
||||
|
||||
elif tag[0] in ["table", "/table", "td", "/td", "th", "/th", "tr", "/tr"]:
|
||||
line_out += "<{}>".format(tag[0])
|
||||
|
||||
elif tag[0] in ["br", "br/"]:
|
||||
elif tag[0] in ["br", "br/", "hr"]:
|
||||
line_out += "<{}>".format(tag[0])
|
||||
|
||||
if line_out != "":
|
||||
|
||||
Reference in New Issue
Block a user