-
Notifications
You must be signed in to change notification settings - Fork 0
/
command_line.py
129 lines (105 loc) · 3.35 KB
/
command_line.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#!/usr/bin/python3
# -*- coding: utf-8 -*-
from io import BytesIO
from io import StringIO
import glob
import logging
import os
import sys
import docx
#from docx.enum.style import WD_STYLE_TYPE
from docx import Document
import adlamConversion
import ahomConversion
from mendeConverter import MendeConverter
import phkConversion
from convertDoc2 import ConvertDocx
# get uploaded file into document form
def createDocFromFile(file_path):
    """Load a .docx file from disk into a docx ``Document``.

    The file is read fully into memory and passed to ``Document`` through a
    ``BytesIO`` buffer, so no file handle stays open after this call.

    Args:
        file_path: Path of the .docx file to open.

    Returns:
        A ``(doc, count)`` tuple: the ``Document`` and the number of bytes
        read from the file. If the file cannot be read or parsed, an error
        line is printed and ``(None, -1)`` is returned instead.
    """
    try:
        # The context manager closes the handle even when reading fails.
        with open(file_path, 'rb') as docFile:
            raw = docFile.read()
        return Document(BytesIO(raw)), len(raw)
    except Exception as err:
        # Best effort: report the problem and let the caller skip this file.
        # KeyboardInterrupt/SystemExit are deliberately not caught here.
        print('Cannot create Docx for %s. Err = %s' % (file_path, err))
        return None, -1
def _createLangConverter(lang):
    """Return a new converter for language code *lang*, or None if unknown."""
    if lang == 'ff':
        return adlamConversion.AdlamConverter()
    if lang == 'aho':
        return ahomConversion.AhomConverter()
    if lang == 'phk':
        return phkConversion.PhakeConverter()
    if lang == 'men':
        return MendeConverter()
    return None


def convertThisDoc(lang, inputFileName):
    """Convert one .docx file and save the result next to the input.

    The converted document is written to ``<input base name>_Unicode.docx``.
    Inputs whose path (without extension) already contains ``Unicode`` are
    skipped so that previously produced output is not converted again.

    Args:
        lang: Language code selecting the converter: ``'ff'``, ``'aho'``,
            ``'phk'`` or ``'men'``.
        inputFileName: Path of the .docx file to convert.

    Returns:
        Whatever ``ConvertDocx.processDocx()`` returns, or None when the file
        is skipped, the language code is unsupported, or the document cannot
        be opened.
    """
    baseName = os.path.splitext(inputFileName)[0]
    # Membership test rather than ``find(...) > 0``, which missed a match at
    # index 0.
    if 'Unicode' in baseName:
        return None
    outFileName = baseName + '_Unicode.docx'

    # Resolve the converter first so an unsupported code is reported cleanly
    # instead of failing later on an unbound variable.
    langConverter = _createLangConverter(lang)
    if langConverter is None:
        logging.warning('Unsupported language code %s for %s',
                        lang, inputFileName)
        return None

    doc, _fileSize = createDocFromFile(inputFileName)
    if not doc:
        logging.warning('No document %s opened', inputFileName)
        return None
    logging.info('Doc created from %s', inputFileName)

    langConverter.setScriptIndex(0)
    langConverter.setLowerMode(True)
    # Only the 'ff' converter is run in sentence mode.
    langConverter.setSentenceMode(lang == 'ff')

    docConverter = ConvertDocx(langConverter, documentIn=doc,
                               reportProgressObj=None)
    if docConverter:
        # processDocx() works on ``doc`` itself, which is then saved.
        result = docConverter.processDocx()
        doc.save(outFileName)
    else:
        result = None

    # Best effort: print the converter's sorted word list, one entry per
    # line. A failure here must not discard the conversion result.
    try:
        wordFrequencies = langConverter.getSortedWordList()
        if wordFrequencies:
            for item in wordFrequencies:
                print(item)
    except Exception as err:
        logging.warning('FAILED TO GET WORD LIST: %s', err)
    return result
def main(argv):
    """Convert every .docx file named on the command line.

    Args:
        argv: Full argument vector: program name, language code, then one or
            more paths. A path that is a directory contributes the
            ``*.docx`` files directly inside it (not recursive); any other
            path is passed through as a file name.

    Returns:
        None. With fewer than two arguments after the program name, a usage
        message is printed and nothing is converted.
    """
    if len(argv) < 3:
        print('Convert .docx files from font encodings to Unicode text')
        print('Usage: python3 command_line lang_code file1 file2 file ...')
        return

    lang = argv[1]

    # Expand directories into the .docx files they contain.
    files = []
    for doc_path in argv[2:]:
        if os.path.isdir(doc_path):
            # Sorted so that the processing order is deterministic.
            files.extend(sorted(glob.glob(os.path.join(doc_path, '*.docx'))))
        else:
            files.append(doc_path)

    for file_path in files:
        print('Converting %s in document %s' % (lang, file_path))
        convertThisDoc(lang, file_path)
# Script entry point: run the converter only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    main(sys.argv)