renforcer les filtres sur les tags

2026-01-28 11:37:04 +01:00
parent f92b700dff
commit 92352450f6
1 changed files with 44 additions and 19 deletions
--- a/html_cleanup/views/default.py
+++ b/html_cleanup/views/default.py
@@ -33,7 +33,7 @@ def home(request):
                    message = "Votre fichier est converti et nettoyé : " + output_name

    return {
-        'page_title': "HTML cleanup 1.2",
+        'page_title': "HTML cleanup 26.01",
        'message': message,
        'file_url': file_url,
        'file_name': output_name,
@@ -119,13 +119,37 @@ def process_file(request, input_file, input_name):
    return message, input_encoding, output_name

 def clean_html(input_file, output_file):
+    #
    # cleanup undesirable tags in html file
+    #
+    #20241118: thn: changed chardet to charset_normalizer
+    #20251216: thn: changed align="justify" to style="text-align:justify;" in <p>,
+    #because the align attribute is deprecated in HTML5
+    #accepted id in <span>, <p>, <h1> ... <h6>
+    #20251217: thn, replaced re.match by re.search
+    #20251223: thn, accepted hyphen in id. Avoid [.:] because they are used in CSS
+    #20251224: check input syntax for id
+    #Only supports tags (<tag ...>) written entirely on one line
+    #20260104: treated "div" the same as "span", "h1"..."h6"
+    #20260104: removed "span", </span>
+    #20260122: moved the function check_input_name into the function clean_html as an inner function
+    #version: 3_5_3_2 (26.01)
+
+    def check_input_name(attribute):
+        ret_flag=False
+        name_regex=r'.*=\s*\"(.*)\".*'
+        matched=re.search(name_regex, attribute)
+        if matched:
+            if matched.group(1)[0].isalpha():
+                ret_flag=True
+        return ret_flag
+
+
    encoding = "utf-8"
    message = ""
 
    try:
        with open(input_file, 'r') as fi, open(output_file, 'w') as fo:
-            fo.write("<html>\n<head>\n<meta http-equiv=\"Content-Type\" content=\"text/html; charset={}\">\n<title>untitled</title>\n</head>\n".format(encoding))

            body = False
            skip_tag = False
@@ -136,9 +160,11 @@ def clean_html(input_file, output_file):
                line_out = ""

                if not body:
-                    init = line_in.find("<body")
-                    if init != -1 and init == 0:
+                    init = re.search(r'^\s*<body.*',line_in)
+                    if init:
                        body = True
+                        fo.write("<html>\n\t<head>\n\t\t<meta http-equiv=\"Content-Type\" content=\"text/html; charset={}\">\n\t\t<title>untitled</title>\n\t</head>\n".format(encoding))
+                        init=0

                if body:
                    nb_lines += 1
@@ -196,27 +222,26 @@ def clean_html(input_file, output_file):

                                    elif tag[0] in ["span", "p", "h1", "h2", "h3", "h4", "h5", "h6"]:
                                        line_out += "<{}".format(tag[0])
-                                        if tag[0] == "p":
-                                            # ajouter text-align dans p
-                                            line_out += ' style="text-align:justify;"'
-                                        if len(tag) > 1:
+                                        # 26.01 : ne plus ajouter text-align dans <p>
+                                        # if tag[0] == "p": 
+                                        #    line_out += ' style="text-align:justify;"'
+                                        if len(tag) > 1 and not skip_tag:
                                            matched = re.search(id_match_regex, line_in)
                                            if matched:
-                                               line_out += ' '+ matched.group(1)
+                                                if check_input_name(matched.group(1)):
+                                                    line_out += ' '+ matched.group(1)
                                        line_out += '>'

-                                    elif tag[0] in ["b", "/b", "em", "/em", "i", "/i", "li", "li/", "/li", "ol", "/ol",
-                                                    "ul", "/ul", "strong", "/strong", "sub", "/sub", "sup", "/sup", "u", "/u"]:
+                                    elif tag[0] in ["b", "/b", "em", "/em", "i", "/i", "li", "/li", "ol", "/ol",
+                                                    "ul", "/ul", "strong", "/strong", "sub", "/sub", "sup", "/sup",
+                                                    "u", "/u"]:
                                        line_out += "<{}>".format(tag[0])
-
-                                    elif tag[0] in ["/span", "/h1", "/h2", "/h3", "/h4", "/h5",  "/h6", "/p", "body", "/body", 
-                                                    "/html", "/div", "/img", "/a"]:
+                                    elif tag[0] in ["/h1", "/h2", "/h3", "/h4", "/h5",  "/h6", "/p",
+                                                    "body", "/body", "/html", "/div", "/img", "/a"]:
                                        line_out += "<{}>".format(tag[0])
-
                                    elif tag[0] in ["table", "/table", "td", "/td", "th", "/th", "tr", "/tr"]:
                                        line_out += "<{}>".format(tag[0])
-
-                                    elif tag[0] in ["br", "br/"]:
+                                    elif tag[0] in ["br", "br/", "hr"]:
                                        line_out += "<{}>".format(tag[0])

                        if line_out != "":