forked from josifoski/SingleAuditRepo
-
Notifications
You must be signed in to change notification settings - Fork 8
/
file_checker_gp.py
92 lines (84 loc) · 3.92 KB
/
file_checker_gp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import csv
import io
import os
from time import sleep
from PyPDF4 import PdfFileReader
from azure.storage.file import FileService, Directory, File
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer3.converter import TextConverter
from pdfminer3.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer3.pdfpage import PDFPage
downloads_path = '/tmp/downloads/FromAzure/'
file_storage_user = ''
file_storage_pwd = ''
file_storage_share = ''
resource_manager = PDFResourceManager()
folders = ['2014', '2015', '2016', '2017', '2018', '2019']
directories = ['General Purpose']
def file_storage_connect():
file_service = FileService(account_name=file_storage_user, account_key=file_storage_pwd, socket_timeout=15)
try:
if file_service.exists(file_storage_share):
print('Connection to Azure file storage successfully established...')
else:
print('Filaed to connect to Asure file storage, share does not exist: ' + file_storage_share)
except Exception as ex:
print('Error connecting to Azure file storage: ', ex)
return file_service
def file_check(dir):
file_names = list(file_service.list_directories_and_files(file_storage_share, dir))
for file in file_names:
if isinstance(file, Directory):
print('Directory: ' + file.name)
if file.name not in folders:
continue
file_check(dir + '/' + file.name)
elif isinstance(file, File):
print('File: ' + file.name)
downloaded = file_service.get_file_to_path(file_storage_share, dir, file.name,
downloads_path + file.name, timeout=120)
if downloaded:
try:
with open(downloads_path + file.name, 'rb') as f:
pdf = PdfFileReader(f)
info = pdf.getDocumentInfo()
print(info)
except Exception as e:
print(e)
try:
fake_file_handle = io.StringIO()
converter = TextConverter(resource_manager, fake_file_handle)
page_interpreter = PDFPageInterpreter(resource_manager, converter)
with open(downloads_path + file.name, 'rb') as fh:
for page in PDFPage.get_pages(fh,
caching=True,
check_extractable=False):
page_interpreter.process_page(page)
break
text_from_pdf_miner = fake_file_handle.getvalue()
converter.close()
fake_file_handle.close()
except Exception as exx:
print(exx)
with open('corrupted_files/corrupted_files_general_purpose.csv', 'a', newline='') as fi:
writer = csv.writer(fi)
writer.writerow([file.name])
if os.path.exists(os.path.join(downloads_path + file.name)):
try:
os.remove(os.path.join(downloads_path, file.name))
except Exception as e:
print(e)
if not os.path.exists(os.path.join(downloads_path + file.name)):
print('Removed {}'.format(downloads_path + file.name))
if __name__ == '__main__':
if not os.path.exists(downloads_path):
os.makedirs(downloads_path)
file_service = file_storage_connect()
dirs = file_service.list_directories_and_files(file_storage_share)
for dir in dirs:
if dir.name not in directories:
continue
print('Directory: ' + dir.name)
file_check(dir.name)