extract and analyze pdf file

This commit is contained in:
2019-08-01 21:39:52 +02:00
parent 6e9b096326
commit e912cca1f2
3 changed files with 180 additions and 40 deletions

View File

@@ -1,11 +1,6 @@
<metal:block use-macro="main_template"> <metal:block use-macro="main_template">
<div metal:fill-slot="content"> <div metal:fill-slot="content">
<p>
<a href="${request.application_url}/" class="btn btn-default" role="button">
<span class="glyphicon glyphicon-chevron-left"></span> Retour</a>
</p>
<table id="users_list" class="table table-striped table-bordered"> <table id="users_list" class="table table-striped table-bordered">
<thead> <thead>
<tr> <tr>
@@ -19,6 +14,20 @@
</table> </table>
<br /> <br />
<form id="change-dossier-details-form" action="${url}" method="post" tal:condition="dt_data"
data-fv-framework="bootstrap"
data-fv-icon-valid="glyphicon glyphicon-ok"
data-fv-icon-invalid="glyphicon glyphicon-remove"
data-fv-icon-validating="glyphicon glyphicon-refresh">
<div class="form-group">
<a href="${request.application_url}/" class="btn btn-default" role="button">
<span class="glyphicon glyphicon-chevron-left"></span> Retour</a>
<button class="btn btn-success" type="submit" name="form.submitted">
<span class="glyphicon glyphicon-download-alt"></span>&nbsp;Générer les dossiers</button>
</div>
</form>
<br /> <br />
<script type="text/javascript"> <script type="text/javascript">

View File

@@ -82,7 +82,7 @@
<a class="btn btn-default" href="${request.application_url}/dossier_view/${nodossier}"> <a class="btn btn-default" href="${request.application_url}/dossier_view/${nodossier}">
<span class="glyphicon glyphicon-chevron-left"></span> Annuler</a> <span class="glyphicon glyphicon-chevron-left"></span> Annuler</a>
<button class="btn btn-primary" type="submit" name="form.submitted"> <button class="btn btn-primary" type="submit" name="form.submitted">
<span class="glyphicon glyphicon-ok"></span>&nbsp;Enregistrer</button> <span class="glyphicon glyphicon-ok"></span>&nbsp;Enregistrer</button>
</div> </div>
</div> </div>
</form> </form>

View File

@@ -22,12 +22,21 @@ from sqlalchemy.exc import DBAPIError
from ..security import groupfinder from ..security import groupfinder
import os import os
import io
import shutil import shutil
import pdfkit import pdfkit
import imaplib import imaplib
import base64 import base64
import email import email
from pdfminer3.layout import LAParams, LTTextBox, LTTextLine
from pdfminer3.pdfpage import PDFPage
from pdfminer3.pdfparser import PDFParser
from pdfminer3.pdfdocument import PDFDocument
from pdfminer3.pdfdevice import PDFDevice
from pdfminer3.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer3.converter import PDFPageAggregator
from ..views.default import * from ..views.default import *
from ..models.default import * from ..models.default import *
from ..models.dossier import * from ..models.dossier import *
@@ -714,36 +723,7 @@ def rdf_bill(request):
@view_config(route_name='demandes', renderer='../templates/dossier/demandes.pt', permission='view') @view_config(route_name='demandes', renderer='../templates/dossier/demandes.pt', permission='view')
def demandes(request): def demandes(request):
url = request.route_url('demandes')
def process_messages(data, search_criteria, liste):
# créer la liste des entêtes des messages à afficher
rv, data = conn.search(None, search_criteria)
if rv != 'OK':
request.session.flash("ERREUR de lecture de la boîte de réception", 'danger')
return HTTPFound(location=request.route_url('home'))
mail_ids = data[0]
for num in mail_ids.split():
rv, msg_data = conn.fetch(num, '(RFC822)')
if rv != 'OK':
request.session.flash("ERREUR de lecture du message %s" % num, 'danger')
return HTTPFound(location=request.route_url('home'))
msg = email.message_from_bytes(msg_data[0][1])
hdr = email.header.make_header(email.header.decode_header(msg['Subject']))
email_subject = str(hdr)
email_from = email.utils.parseaddr(msg['from'])[1]
import pdb;pdb.set_trace()
# Now convert to local date-time
date_tuple = email.utils.parsedate_tz(msg['Date'])
if date_tuple:
email_date = datetime.fromtimestamp(email.utils.mktime_tz(date_tuple))
else:
email_date = datetime.now()
d = (str(int(num)), email_date.strftime('%d-%m-%Y %H:%M:%S'), email_from, mbx_name.replace('entreprise-dumas.com', ''), email_subject)
liste.append(d)
return liste
# lire les demandes d'interventions arrivées par email # lire les demandes d'interventions arrivées par email
mbx_name = 'peinture-dumas@entreprise-dumas.com' mbx_name = 'peinture-dumas@entreprise-dumas.com'
@@ -761,16 +741,167 @@ def demandes(request):
liste=[] liste=[]
# lire demandes de la MAIF # lire demandes de la MAIF
mbx_subject = 'FROM gestionsinistre@maif.fr SUBJECT "Missionnement r"' mbx_search = 'FROM gestionsinistre@maif.fr SUBJECT "Missionnement r"'
process_messages(data, mbx_subject, liste) if 'form.submitted' in request.params:
demandes_generer(conn, mbx_name, mbx_search, liste)
demandes_afficher(conn, mbx_name, mbx_search, liste)
# lire demandes de DOMUS # lire demandes de DOMUS
mbx_subject = 'FROM service.sinistres@domus-services.fr SUBJECT "Ordre de mission DOMUS - Dossier"' mbx_search = 'FROM service.sinistres@domus-services.fr SUBJECT "Ordre de mission DOMUS - Dossier"'
process_messages(data, mbx_subject, liste) demandes_afficher(conn, mbx_name, mbx_search, liste)
conn.logout() conn.logout()
return { return {
'page_title': 'Liste des demandes pour la PEINTURE', 'page_title': 'Liste des demandes pour la PEINTURE',
'url': url,
'dt_data': json.dumps(liste), 'dt_data': json.dumps(liste),
} }
def demandes_afficher(conn, mbx_name, search_criteria, liste, request=None):
    """Append one display row per matching e-mail to ``liste``.

    Runs ``conn.search(None, search_criteria)``, fetches each hit with
    ``conn.fetch(num, '(RFC822)')`` and appends the tuple
    ``(message number, 'dd-mm-YYYY HH:MM:SS' date, sender address,
    mailbox label, decoded subject)``.

    Args:
        conn: object exposing ``search`` and ``fetch`` -- presumably an
            ``imaplib`` connection with a mailbox already selected;
            verify against the caller.
        mbx_name: mailbox address; the 'entreprise-dumas.com' part is
            removed to build the label shown in the row.
        search_criteria: search string passed unchanged to ``conn.search``.
        liste: list that receives the rows; it is mutated in place.
        request: optional request whose ``session.flash`` receives error
            messages. When omitted, the Pyramid thread-local request is
            used if one is available.

    Returns:
        ``liste``. On a search/fetch failure the error is flashed and the
        rows collected so far are returned.
    """

    def report_error(message):
        # The function used to be a closure inside the view and read
        # ``request`` from the enclosing scope; as a module-level function
        # that name no longer exists, so resolve the request explicitly.
        current = request
        if current is None:
            try:
                from pyramid.threadlocal import get_current_request
            except ImportError:
                current = None
            else:
                current = get_current_request()
        if current is not None:
            current.session.flash(message, 'danger')

    # build the list of message headers to display
    rv, data = conn.search(None, search_criteria)
    if rv != 'OK':
        report_error("ERREUR de lecture de la boîte de réception")
        return liste

    mbx_label = mbx_name.replace('entreprise-dumas.com', '')
    for num in data[0].split():
        rv, msg_data = conn.fetch(num, '(RFC822)')
        if rv != 'OK':
            # int(num) avoids showing the bytes repr (b'12') to the user
            report_error("ERREUR de lecture du message %s" % int(num))
            return liste

        msg = email.message_from_bytes(msg_data[0][1])
        # a message without a Subject header yields None, which
        # decode_header() cannot handle
        hdr = email.header.make_header(
            email.header.decode_header(msg['Subject'] or ''))
        email_subject = str(hdr)
        email_from = email.utils.parseaddr(msg['from'])[1]

        # convert the Date header to local date-time; fall back to "now"
        # when the header is missing or unparsable
        date_tuple = email.utils.parsedate_tz(msg['Date'])
        if date_tuple:
            email_date = datetime.fromtimestamp(
                email.utils.mktime_tz(date_tuple))
        else:
            email_date = datetime.now()

        liste.append((
            str(int(num)),
            email_date.strftime('%d-%m-%Y %H:%M:%S'),
            email_from,
            mbx_label,
            email_subject,
        ))
    return liste
def demandes_generer(conn, mbx_name, search_criteria, liste, request=None):
    """Process the attachment of every matching request e-mail.

    For each message returned by ``conn.search(None, search_criteria)``
    whose subject does not contain 'Annulation ', the first named
    attachment is saved under /tmp, converted to text with pdfminer and
    then either deleted (the text announces a cancelled mission) or parsed
    for the request details.

    Args:
        conn: object exposing ``search`` and ``fetch`` -- presumably an
            ``imaplib`` connection with a mailbox already selected;
            verify against the caller.
        mbx_name: mailbox address, forwarded to the parsing step.
        search_criteria: search string passed unchanged to ``conn.search``.
        liste: accepted for signature parity with ``demandes_afficher``;
            not used here.
        request: optional request whose ``session.flash`` receives error
            messages. When omitted, the Pyramid thread-local request is
            used if one is available.

    Returns:
        None. On a search/fetch failure the error is flashed and
        processing stops.
    """

    def report_error(message):
        # ``request`` is not a module-level name; resolve it explicitly
        # instead of failing with NameError on the error paths.
        current = request
        if current is None:
            try:
                from pyramid.threadlocal import get_current_request
            except ImportError:
                current = None
            else:
                current = get_current_request()
        if current is not None:
            current.session.flash(message, 'danger')

    def download_pdf_to_tmp(email_message):
        """Save the first named attachment under /tmp.

        Returns the path of the saved file, or None when the message has
        no named attachment.
        """
        for part in email_message.walk():
            # multipart containers only wrap the real parts
            if part.get_content_maintype() == 'multipart':
                continue
            # parts without a Content-Disposition header are skipped
            if part.get('Content-Disposition') is None:
                continue
            file_name = part.get_filename()
            if not file_name:
                continue
            # The file name is chosen by the sender: keep only its last
            # component so it cannot point outside /tmp.
            safe_name = os.path.basename(file_name.replace('\\', '/'))
            if safe_name in ('', '.', '..'):
                continue
            file_path = os.path.join('/tmp/', safe_name)
            # an already downloaded file is reused as-is
            if not os.path.isfile(file_path):
                payload = part.get_payload(decode=True)
                with open(file_path, 'wb') as fp:
                    fp.write(payload)
            return file_path
        return None

    def convert_pdf_to_txt(path):
        """Extract the text of a PDF; return ``(text, text_file_path)``."""
        resource_manager = PDFResourceManager()
        converter = PDFPageAggregator(resource_manager, laparams=LAParams())
        page_interpreter = PDFPageInterpreter(resource_manager, converter)

        chunks = []
        try:
            with open(path, 'rb') as fh:
                for page in PDFPage.get_pages(fh,
                                              caching=True,
                                              check_extractable=True):
                    page_interpreter.process_page(page)
                    # the converter holds the layout of the page that was
                    # just interpreted
                    layout = converter.get_result()
                    # only text boxes and text lines carry text
                    for lt_obj in layout:
                        if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                            chunks.append(lt_obj.get_text())
        finally:
            # release the converter even when parsing fails
            converter.close()
        extracted_text = ''.join(chunks)

        # write the text to a file; explicit encoding so that writing and
        # re-reading the accented text does not depend on the locale
        extracted_file = '/tmp/log_file.txt'
        with open(extracted_file, 'w', encoding='utf-8') as my_log:
            my_log.write(extracted_text)
        return extracted_text, extracted_file

    def generer_dossier(mbx_name, filepath):
        """Scan the extracted text for the request details.

        Returns a dict with keys 'no_sinistre', 'nom', 'adr', 'cp',
        'ville' and 'tel'; a value stays None when its marker line was not
        found. The line offsets below follow the original parsing code.
        """
        infos = {
            'no_sinistre': None,
            'nom': None,
            'adr': None,
            'cp': None,
            'ville': None,
            'tel': None,
        }
        with open(filepath, encoding='utf-8') as fp:
            line = fp.readline()
            while line:
                # The tests are sequential ``if``s on purpose: each branch
                # advances ``line`` and the following tests apply to the
                # new current line.
                if line.startswith('Nos références'):
                    # the value is two lines below the marker
                    fp.readline()
                    line = fp.readline()
                    infos['no_sinistre'] = line.rstrip('\n')
                if line.startswith('Bénéficiaire des travaux :'):
                    infos['nom'] = line.partition(' : ')[2].rstrip('\n')
                    # street is three lines below, postcode/town follow
                    fp.readline()
                    fp.readline()
                    line = fp.readline()
                    infos['adr'] = line.rstrip('\n')
                    line = fp.readline()
                    infos['cp'] = line[0:5]
                    infos['ville'] = line.rstrip('\n')[6:]
                if line.startswith('N° de téléphone :'):
                    # the last 10 characters of the line
                    infos['tel'] = line.rstrip('\n')[-10:]
                # read the next line
                line = fp.readline()
        # TODO: create the "dem_devis" record from ``infos``
        return infos

    # look for the request e-mails in the mailbox
    rv, data = conn.search(None, search_criteria)
    if rv != 'OK':
        report_error("ERREUR de lecture de la boîte de réception")
        return

    for num in data[0].split():
        rv, msg_data = conn.fetch(num, '(RFC822)')
        if rv != 'OK':
            # int(num) avoids showing the bytes repr (b'12') to the user
            report_error("ERREUR de lecture du message %s" % int(num))
            return

        # parse the raw bytes directly: decoding the whole message as
        # UTF-8 first fails on 8-bit content in another charset
        email_message = email.message_from_bytes(msg_data[0][1])
        # decode RFC 2047 encoded words; a missing Subject yields ''
        email_subject = str(email.header.make_header(
            email.header.decode_header(email_message['subject'] or '')))

        # cancelled request?
        if 'Annulation ' in email_subject:
            continue

        file_path = download_pdf_to_tmp(email_message)
        if file_path is None:
            # nothing attached: nothing to convert
            continue

        texte, extracted_file = convert_pdf_to_txt(file_path)
        if 'Objet : ANNULATION MISSION' in texte:
            # cancelled mission: drop the downloaded PDF
            os.remove(file_path)
        else:
            # build the request from the extracted text
            generer_dossier(mbx_name, extracted_file)
    return