diff --git a/development.ini b/development.ini index 8a33c74..edb6096 100644 --- a/development.ini +++ b/development.ini @@ -14,9 +14,8 @@ pyramid.default_locale_name = en pyramid.includes = pyramid_debugtoolbar -# By default, the toolbar only appears for clients from IP addresses -# '127.0.0.1' and '::1'. -# debugtoolbar.hosts = 127.0.0.1 ::1 +# temp folder location +temp_folder = /Users/phuoc/pyramid/html_cleanup/html_cleanup/static/temp/ ### # wsgi server configuration diff --git a/html_cleanup/static/temp/clean_Nantes_anthologie_cinema_viet.htm b/html_cleanup/static/temp/clean_Nantes_anthologie_cinema_viet.htm new file mode 100644 index 0000000..23b4f73 --- /dev/null +++ b/html_cleanup/static/temp/clean_Nantes_anthologie_cinema_viet.htm @@ -0,0 +1,5 @@ + + + +untitled + diff --git a/html_cleanup/static/temp/clean_www.icloud.com.html b/html_cleanup/static/temp/clean_www.icloud.com.html new file mode 100644 index 0000000..23b4f73 --- /dev/null +++ b/html_cleanup/static/temp/clean_www.icloud.com.html @@ -0,0 +1,5 @@ + + + +untitled + diff --git a/html_cleanup/static/temp/www.icloud.com.html b/html_cleanup/static/temp/www.icloud.com.html new file mode 100644 index 0000000..f7a55ae --- /dev/null +++ b/html_cleanup/static/temp/www.icloud.com.html @@ -0,0 +1,496 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + iCloud + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/html_cleanup/templates/404.jinja2 b/html_cleanup/templates/404.jinja2 index aaf1241..a9c7cce 100644 --- a/html_cleanup/templates/404.jinja2 +++ b/html_cleanup/templates/404.jinja2 @@ -2,7 +2,7 @@ {% block content %}
-

Pyramid Starter project

+

HTML Cleanup

404 Page Not Found

{% endblock content %} diff --git a/html_cleanup/templates/home.jinja2 b/html_cleanup/templates/home.jinja2 new file mode 100644 index 0000000..2bcb185 --- /dev/null +++ b/html_cleanup/templates/home.jinja2 @@ -0,0 +1,39 @@ +{% extends "layout.jinja2" %} + +{% block content %} + + {% if message %} +
+ {{ message }} +
+ {% endif %} + +
+
+ + +
+
+
+ +
+
+ + + + {% if file_url %} +

CLEANUP réussi

+ Votre fichier nettoyé : {{ file_name }}: + Télécharger + {% endif %} + +{% endblock content %} + + diff --git a/html_cleanup/templates/layout.jinja2 b/html_cleanup/templates/layout.jinja2 index ad1a574..63f50b3 100644 --- a/html_cleanup/templates/layout.jinja2 +++ b/html_cleanup/templates/layout.jinja2 @@ -24,19 +24,23 @@ + +
-
-
- {% block content %} -

No content

- {% endblock content %} -
-
- -
-
+ + {% if page_title %} +

{{ page_title }}

+
+ {% endif %} + + +
+ {% block content %} +

No content

+ {% endblock content %} +
+ +
diff --git a/html_cleanup/templates/mytemplate.jinja2 b/html_cleanup/templates/mytemplate.jinja2 deleted file mode 100644 index f2e7283..0000000 --- a/html_cleanup/templates/mytemplate.jinja2 +++ /dev/null @@ -1,8 +0,0 @@ -{% extends "layout.jinja2" %} - -{% block content %} -
-

Pyramid Starter project

-

Welcome to {{project}}, a Pyramid application generated by
Cookiecutter.

-
-{% endblock content %} diff --git a/html_cleanup/views/default.py b/html_cleanup/views/default.py index b2a1790..1ef91d0 100644 --- a/html_cleanup/views/default.py +++ b/html_cleanup/views/default.py @@ -1,6 +1,150 @@ from pyramid.view import view_config +from pyramid.httpexceptions import HTTPFound +import os +import shutil +import magic +import sys +@view_config(route_name='home', renderer='html_cleanup:templates/home.jinja2') +def home(request): -@view_config(route_name='home', renderer='html_cleanup:templates/mytemplate.jinja2') -def my_view(request): - return {'project': 'html_cleanup'} + message = '' + file_name = '' + file_url = '' + + if 'form.submitted' in request.params: + if request.POST['uploadfile'] != b'': + input_file = request.POST['uploadfile'].file + input_name = request.POST['uploadfile'].filename + # take the uploaded file, run the checks and process it + message = process_file(request, input_file, input_name) + if message == '': + file_name = "clean_" + input_name + file_url = request.static_url('html_cleanup:static/temp/') + file_name + + return { + 'page_title': "HTML cleanup", + 'message': message, + 'file_url': file_url, + 'file_name': file_name, + } + +def clean_html(input_file, output_file): + # Rewrite input_file into output_file, copying text and only the tags named in the lists below. + # Returns '' on success, or "Error: ..." if anything raised. + encoding = "utf-8" + message = "" + # NOTE(review): leftover pdb breakpoint - every call stops in the debugger; remove before deploying + import pdb;pdb.set_trace() + + try: + with open(input_file, 'r', encoding='utf-8') as fi, open(output_file, 'w', encoding='utf-8') as fo: + fo.write("\n\n\nuntitled\n\n") + + body = False + skip_tag = False + nb_lines = 0 + + for line_in in fi: + line_out = "" + + if not body: + # NOTE(review): the statements around this find() look truncated in this copy of the patch (start_from is tested before any visible assignment) - check against the real file + init = line_in.find("") + if start_from != -1 and start_from >= 0: + skip_tag = False + start_from = start_from + 1 if start_from < len(line_in) else 0 + else: + start_from = init if nb_lines == 1 else 0 + + if start_from >= 0: + done = False + while not done: + next_tag = line_in.find("<", start_from) + if next_tag == -1: + line_out += line_in[start_from:] + done = True + skip_tag = False 
+ else: + if next_tag > start_from: + line_out += line_in[start_from:next_tag] + + end_tag = line_in.find(">", next_tag + 1) + if end_tag == -1: + s = line_in[next_tag + 1:] + done = True + skip_tag = True + else: + s = line_in[next_tag + 1:end_tag] + + if end_tag < len(line_in): + start_from = end_tag + 1 + done = False + skip_tag = False + else: + done = True + skip_tag = False + + tag = s.split(" ") + + if tag: + tag[0] = tag[0].lower() + + if tag[0] in ["table", "div", "img", "a"]: + line_out += "<{}>".format(s) + elif tag[0] in ["p"]: + line_out += "

" + elif tag[0] in ["b", "/b", "em", "/em", "i", "/i", "li", "li/", "/li", + "ol", "/ol", "ul", "/ul", "strong", "/strong", "sub", "/sub", + "sup", "/sup", "u", "/u"]: + line_out += "<{}>".format(tag[0]) + elif tag[0] in ["h1", "/h1", "h2", "/h2", "h3", "/h3", "h4", "/h4", "h5", + "/h5", "h6", "/h6", "/p", "body", "/body", "/html", "/div", + "/img", "/a"]: + line_out += "<{}>".format(tag[0]) + elif tag[0] in ["table", "/table", "td", "/td", "th", "/th", "tr", "/tr"]: + line_out += "<{}>".format(tag[0]) + elif tag[0] in ["br", "br/"]: + line_out += "<{}>".format(tag[0]) + + if line_out != "": + fo.write(line_out + "\n") + + except Exception as e: + message = "Error: {}".format(str(e)) + + return message + +def process_file(request, input_file, input_name): + # Check the upload's MIME type and size and, if both pass, run clean_html() on it. + # Returns '' on success, otherwise the error message to show the user. + message = '' + ext_allowed = ['text/html', 'image/png', 'image/jpeg'] + max_size = 10 * (1024 ** 2) # 10 MiB, in bytes + temp_folder = request.registry.settings['temp_folder'] + + # NOTE(review): read() pulls the whole upload into memory before the size limit below is checked + mime = magic.from_buffer(input_file.read(), mime=True) + # is the file type allowed? + if mime not in ext_allowed: + message = "ERREUR: Le format du fichier n'est pas valide. Téléchargement refusé." + else: + # read the file size + pos = input_file.tell() + input_file.seek(0, 2) # seek to end + filesize = input_file.tell() + input_file.seek(pos) # back to original position + # check the file size against the limit + if filesize > max_size: + message = "ERREUR: La taille du fichier dépasse la limite autorisée. Téléchargement refusé." 
+ else: + # checks passed, process the file + # NOTE(review): input_name is the client-supplied filename and is joined into the output path unsanitised - verify + # NOTE(review): input_file is the upload's file object (see home), but clean_html() hands it to open() as if it were a path - confirm + output_name = "clean_" + input_name + output_file = os.path.join(temp_folder, output_name) + message = clean_html(input_file, output_file) + + return message \ No newline at end of file diff --git a/html_cleanup/views/notfound.py b/html_cleanup/views/notfound.py index f96ca7b..59817e8 100644 --- a/html_cleanup/views/notfound.py +++ b/html_cleanup/views/notfound.py @@ -1,6 +1,5 @@ from pyramid.view import notfound_view_config - @notfound_view_config(renderer='html_cleanup:templates/404.jinja2') def notfound_view(request): request.response.status = 404 diff --git a/production.ini b/production.ini index 1e4e85e..7db9660 100644 --- a/production.ini +++ b/production.ini @@ -12,6 +12,9 @@ pyramid.debug_notfound = false pyramid.debug_routematch = false pyramid.default_locale_name = en +# temp folder location +temp_folder = /pyramid/html_cleanup/html_cleanup/static/temp/ + ### # wsgi server configuration ### diff --git a/setup.py b/setup.py index c288a4e..24eabe2 100644 --- a/setup.py +++ b/setup.py @@ -13,6 +13,7 @@ requires = [ 'pyramid', 'pyramid_jinja2', 'pyramid_debugtoolbar', + 'python-magic', 'waitress', ]