
| Current Path : /home/cgabriel/20_dev/11_iftlib/sysadmin/ |
Linux ift1.ift-informatik.de 5.4.0-216-generic #236-Ubuntu SMP Fri Apr 11 19:53:21 UTC 2025 x86_64 |
| Current File : //home/cgabriel/20_dev/11_iftlib/sysadmin/util.py |
# coding: utf8
import os,sys,glob,sys,re,random,time
class Util (object):
    """Collection of small sysadmin helpers; the script entry point calls them by name."""

    def __init__ (self):
        """Set the defaults used by the scan and OCR helpers."""
        self.IFT = "ift"
        # Mode passed to hp-scan via --mode; scanc() switches it to "Color".
        self.colormode = "LineArt"
        # tesseract argument strings for German and for English text.
        self.TESSERACT = "tesseract -l deu --psm 11 pdf"
        self.TESSERACT1 = "tesseract -l eng --psm 11 pdf"
#***********************************************************
def subtree (self,pars):
    """Export the sub-directory ``pars[0]`` of the current git repository as a
    bare repository ``~/<name>`` whose ``master`` holds the split-off history.

    ``<name>`` is the last path component of ``pars[0]``.  A temporary branch
    of that name is created with ``git subtree split``, pushed as ``master``
    into the new repository and deleted again.

    Raises IndexError when ``pars`` is empty.
    """
    import subprocess

    # End with exactly one separator and collapse doubled separators.
    prefix = re.sub(r"([\\\/])[\\\/]","\\1",pars[0] + "/",99)
    m = re.search(r"^(.*)[\\\/](.*)[\\\/]$",prefix)
    newbranch = m.group(2) if m else prefix[:-1]
    print(prefix,newbranch)
    target = os.path.join(os.path.expanduser("~"),newbranch)

    def git(*args):
        # Argument lists instead of shell strings: names containing spaces or
        # shell metacharacters reach git unchanged.
        return subprocess.call(["git"] + list(args))

    git("branch","-D",newbranch)                 # drop a stale branch of an earlier run
    git("subtree","--prefix",prefix,"split","-b",newbranch)
    os.makedirs(target,exist_ok=True)
    # "--bare", not "-bare": git reads the latter as "-b are" (initial branch
    # name) or rejects it, depending on the git version.
    git("init","--bare",target)
    git("push",target,newbranch + ":master")
    git("branch","-D",newbranch)
    # A bare repository has no work tree to check out; pointing HEAD at master
    # is the equivalent of the former "git checkout master" step.
    git("--git-dir",target,"symbolic-ref","HEAD","refs/heads/master")
#***********************************************************
def del_all (self,pars):
    """Remove LaTeX by-products, editor backups and ``*.pyc`` files below the
    current directory and reset ``self.pdffiles``.

    * LaTeX auxiliary files (``*.aux``, ``*.toc``, ...), ``texput.*``,
      ``xxqq.*`` and placeholder files are removed in the current directory only.
    * ``*~``, ``.*~`` and ``*.pyc`` are removed in the current directory and
      up to nine directory levels below it.
    * If ``pars`` is not empty, empty directories up to 18 levels deep are
      removed as well, deepest first.

    Files that cannot be removed are skipped silently.
    """
    def remove_files(pattern):
        for path in glob.glob(pattern):
            # Like plain "rm": never remove a directory.
            if os.path.isdir(path) and not os.path.islink(path):
                continue
            try:
                os.remove(path)
            except OSError:
                pass

    for suffix in ("aux","toc","ind","ilg","idx","bbc","lvz"):
        remove_files("*." + suffix)
    # "*" does not match names with a leading dot, hence the separate ".*~".
    for name_pattern in ("*~","*.pyc",".*~"):
        for depth in range(10):
            remove_files("*/" * depth + name_pattern)
    for pattern in ("texput.*","xxqq.*","*P_L_A_C_E_H_O_L_D_E_R*.*"):
        remove_files(pattern)

    if len(pars) > 0:
        # Deepest level first, so that a directory emptied by removing its
        # sub-directories can be removed in a later round.
        for depth in range(18,0,-1):
            for path in glob.glob("*/" * depth):
                try:
                    os.rmdir(path)
                except OSError:
                    pass                      # not empty, or not a real directory
    self.pdffiles = {}
#***********************************************************
def addclip (self,pars):
    """Poll the text file ``pars[0]`` once per second and total the numbers in it.

    Whenever the content has changed, lines of the form
    ``<8 digits> <amount> <word> <word> <amount> <rest>`` are reduced to their
    first amount, every whitespace-separated token that parses as a float is
    added up, the total is printed with two decimals, ``pars[0]`` is emptied
    and, if ``pars[1]`` is given and writable, the total is written there.

    The method never returns; stop it with Ctrl-C.
    Raises IndexError when ``pars`` is empty.
    """
    watched = pars[0]
    booking_line = re.compile(
        r"(^|\n)(\d\d\d\d\d\d\d\d) +(-?\d+\.\d\d) +(\S+) +(\S+) +(-?\d+\.\d\d) +(.*?)\n",
        re.DOTALL)
    text = ""
    while True:
        text0 = text
        try:
            with open(watched) as source:
                text = source.read()
        except (OSError,UnicodeError):
            text = ""                          # missing or unreadable counts as empty
        if text != text0:
            text1 = text
            # Two passes: a match consumes the newline that the directly
            # following line would need as its own start marker.
            for _ in (1,2):
                text1 = booking_line.sub("\\1 \\3 \n",text1)
            values = []
            for entry in text1.split():
                try:
                    values.append(float(entry))
                except ValueError:
                    pass                       # not a number, ignore
            if values:
                result = "%3.2f" % sum(values)
                print (result)
                open(watched,"w").close()      # empty the watched file
                try:
                    with open(pars[1],"w") as target:
                        target.write(result + "\n")
                except (IndexError,OSError):
                    pass                       # the result file is optional
        time.sleep(1)
#******************************************************
def normalize (self,pars):
    """Rename the entries of the current directory to shell-friendly names.

    Runs of blanks and special characters become one underscore, accented
    letters lose their accent, German umlauts and sharp s are transliterated,
    and ``_-_``, ``_-`` and ``:`` become a double underscore.  ``pars`` is
    not used.

    An entry is left alone when the normalised name already exists, so that
    no file is overwritten by a rename.
    """
    # Applied in this order; each pair is (regular expression, replacement).
    rules = (
        (r"[ \\\+\(\)\'\°[\]\!\{\}\,~\&]+","_"),
        (r"[éèê]","e"),
        (r"[óòô]","o"),
        (r"[áàâ]","a"),
        (r"[úùû]","u"),
        (r"[Ç]","C"),
        (r"[íìîı]","i"),
        (r"[İ]","I"),
        (r"ä","ae"),
        (r"ö","oe"),
        (r"ü","ue"),
        (r"Ä","Ae"),
        (r"Ö","Oe"),
        (r"Ü","Ue"),
        (r"ş","s"),
        (r"ß","ss"),
        (r"_-_","__"),
        (r"_-","__"),
        (r"\:","__"),
    )
    for file in os.listdir('.'):
        file1 = file
        for pattern,replacement in rules:
            file1 = re.sub(pattern,replacement,file1)
        if file == file1:
            continue
        if os.path.lexists(file1):
            # os.rename would silently replace the existing file.
            print(file + " not renamed: " + file1 + " already exists")
            continue
        print(file + " -> " + file1)
        os.rename(file,file1)
#******************************************************
def dbl_files (self,pars):
    '''
    Identifies the double file candidates of a list of files.

    The list (one path per line) is read from the file pars[0], or from
    stdin if pars is empty or pars[0] is "-".  Two files are candidates if
    they have the same size and the same first 100000 bytes.  Receipt files
    named like <6 digits>.[qq_|qw_]<digits>_<2 digits>_... are grouped by
    these two numbers instead.

    For every group of at least two files the size of its first file
    divided by 100000, the file names and an empty line are printed; the
    groups are ordered by that size.
    '''
    if len(pars) == 0 or pars[0] == "-":
        text = sys.stdin.read()
    else:
        with open(pars[0]) as listing:
            text = listing.read()
    groups = {}
    for file in text.split("\n"):
        file = re.sub(r"^\.[\\\/]","",file)
        if not os.path.isfile(file):
            continue
        filesize = os.path.getsize(file)
        m = re.search(r"(^|\/|\\)(\d\d\d\d\d\d)\.(qq_|qw_|)(\d+_\d\d)_",file)
        if m:
            key = m.group(2) + "__" + m.group(4)   # group receipts by their content id
        else:
            # Binary mode: arbitrary file content must not fail to decode.
            with open(file,"rb") as content:
                key = (filesize,content.read(100000))
        groups.setdefault(key,[filesize]).append(file)
    # Every group is [size, file, file, ...]; more than two entries = duplicates.
    doubles = [group for group in groups.values() if len(group) > 2]
    doubles.sort(key=lambda group: group[0])
    for group in doubles:
        print ("%16.6f" % (group[0]/100000.0))
        for file in group[1:]:
            print (file)
        print ("")
#****************************************************
def utf (self,pars):
    """Convert the ISO-8859 / ASCII text files of the current directory and
    of its direct sub-directories to UTF-8.

    The character set is taken from ``file -i``; an "...8bit" charset is
    treated as iso-8859-1.  Before a file is converted with ``iconv`` a copy
    named ``<file>~`` is made.  The original is only overwritten after the
    conversion has succeeded.  ``pars`` is not used.
    """
    import shutil
    import subprocess

    for file in glob.glob("*") + glob.glob("*/*"):
        if not os.path.isfile(file):
            continue
        if file.endswith("~"):                 # backup of an earlier run
            continue
        try:
            # -b: no file name in the output, so a name containing
            # "charset=" cannot confuse the search below.
            filetyp = subprocess.check_output(["file","-b","-i",file],
                                              universal_newlines=True)
        except (OSError,subprocess.CalledProcessError):
            continue
        m = re.search(r"charset\=(\S+)",filetyp,re.DOTALL)
        if not m:
            continue
        filecode = m.group(1)
        if '8bit' in filecode:
            filecode = 'iso-8859-1'
        if filecode == "utf-8" or not ("iso" in filecode or "ascii" in filecode):
            continue
        backup = file + "~"
        shutil.copy(file,backup)
        print("iconv -f " + filecode + " -t utf-8 " + file + "\n")
        try:
            converted = subprocess.check_output(
                ["iconv","-f",filecode,"-t","utf-8",backup])
        except (OSError,subprocess.CalledProcessError) as err:
            print("iconv failed for " + file + ": " + str(err))
            continue
        with open(file,"wb") as target:
            target.write(converted)
#****************************************************
def scanc (self,pars):
    """Colour variant of scan(): select the "Color" mode, then scan with the same parameters."""
    self.colormode = "Color"
    self.scan(pars)
#****************************************************
def scan (self,pars):
    """Scan a document with hp-scan into the next free ``hpscanNNN.pdf``.

    pars[0]  key code: 5 scans without, 7 with the ``--adf`` option of
             hp-scan, 0 writes the marker file ``exit.txt``.  ASCII codes
             of the digit keys (48..57) are accepted as well.
    pars[1]  optional target directory (default "/home/xxx01_scan"; the
             current directory is used if it does not exist).
    pars[2]  optional regular expression selecting one of the known scan
             device URLs (default "278").

    After a scan the bell is rung and scanconcat() is called with the
    previous and the new file name.
    """
    scandir = pars[1] if len(pars) > 1 else "/home/xxx01_scan"
    scandev = pars[2] if len(pars) > 2 else "278"
    if not os.path.isdir(scandir):
        scandir = "."
    devnull = ""
    if scandir == "/home/01_scan":
        devnull = " > /dev/null"
    c1 = int(pars[0]) - 48                     # normalise the key code to the digit key
    if c1 < 0:
        c1 = c1 + 48

    scan_device = None                         # stays None if no device answers
    for sdev in [
            'escl:https://192.168.153.73:443',
            'escl:http://2a02:810d:9880:7fc2:82ce:62ff:fe5a:760c:8080',
            'escl:http://192.168.153.73:8080',
            'hpaio:/net/hp_colorlaserjet_mfp_m278-m281?ip=192.168.153.73'
            ]:
        if not re.search(scandev,sdev):
            continue
        if not re.search(r"not +supported",os.popen("scanimage -d " + sdev + " / 2>&1").read()):
            scan_device = sdev
            break
    if scan_device is None:
        print ("SCAN DEVICE: none found matching " + scandev)
    else:
        print ("SCAN DEVICE: " + scan_device)

    if c1 == 0:                                # leave the loop
        with open("exit.txt","w") as marker:
            marker.write("1\n")
        os.system("chmod 775 exit.txt")

    if c1 in [5,7] and scan_device is not None:
        # The new file gets the highest existing scan number plus one.
        zaehler = 0
        for file in os.listdir(scandir):
            m = re.search(r"hpscan(\d+)\.pdf$",file)
            if m:
                zaehler = max(zaehler,int(m.group(1)))
        zaehler = zaehler + 1
        datei = "hpscan" + ("%03u" % zaehler)
        datei_1 = "hpscan" + ("%03u" % (zaehler-1))
        # Derived files of an earlier scan with the same number are stale.
        for stale in ("double_","ocr_","ocr_double_"):
            stale_file = scandir + "/" + stale + datei + ".pdf"
            if os.path.isfile(stale_file):
                os.unlink(stale_file)
        os.system("cd "+scandir +"; hp-scan --size=a4 -d " + scan_device +     # the SCAN
                  [""," --adf"][(c1-5)//2] + " --mode " + self.colormode + " -o" + datei +".pdf " + devnull + "; chmod 775 " + datei +".pdf")
        self.beep([0,100,500,100])
        self.scanconcat([datei_1+".pdf",datei+".pdf",scandir,c1])
#*****************************************************************************
def scanconcat (self,pars):
    """Combine the previous scan ``pars[0]`` with the new scan ``pars[1]``.

    pars[2]  optional directory holding both files (default ".").
    pars[3]  optional key code (default 7), an int or a numeric string:
             5  if the previous scan has exactly one page, the new scan is
                appended to it (or to its ``append_`` file, if present) as
                ``append_<new scan>``.
             7  if the page counts differ by at most one, both scans are
                interleaved into ``double_<new scan>``: page 1, last page,
                page 2, last but one page, ...

    The page counts are taken from pdftk's error message for an
    out-of-range page request.
    """
    datei_1 = pars[0]
    datei = pars[1]
    scandir = pars[2] if len(pars) > 2 else "."
    c1 = pars[3] if len(pars) > 3 else 7
    try:
        c1 = int(c1)                           # command line parameters arrive as strings
    except (TypeError,ValueError):
        pass

    def page_count(name):
        # Page 99999999 does not exist; pdftk answers with the real page count.
        text = os.popen("cd " + scandir + "; pdftk " + name + " cat 99999999 xyz.pdf 2>&1").read()
        found = re.search(r"input +PDF +has. +(\d+) +pages",text)
        return int(found.group(1)) if found else None

    pages_1 = page_count(datei_1)              # None: not a readable PDF
    if pages_1 is not None and not os.path.isfile(scandir+"/"+datei):
        with open(scandir+"/"+datei,"w") as placeholder:
            placeholder.write("1\n")
    pages = page_count(datei)

    if c1 == 5 and pages_1 == 1:
        first = datei_1
        if os.path.isfile(scandir+"/append_"+datei_1):
            first = "append_" + datei_1
        os.system("cd " + scandir + "; pdftk " + first + " " + datei +
                  " cat output append_" + datei)

    if c1 == 7 and pages is not None and pages_1 is not None and abs(pages-pages_1) < 2:
        seitenzahl = pages + pages_1
        # offset 0 (new scan has one page more): the first pair would start
        # with the non-existing page 0, so only its partner is taken.
        offset = 1 - max(0,pages-pages_1)
        catparameter = []
        step = 0
        while True:
            low,high = step + offset,seitenzahl - step
            if low > high:
                break
            if low == 0:
                catparameter.append(str(high))
            elif low == high:                  # middle page of an odd total
                catparameter.append(str(low))
            else:
                catparameter = catparameter + [str(low),str(high)]
            step = step + 1
        os.system("cd " + scandir + "; pdftk " + datei_1 + " " + datei + " cat output xyztmp.pdf")
        os.system("cd " + scandir + "; pdftk xyztmp.pdf cat " + " ".join(catparameter) +
                  " output double_" + datei)
        if os.path.isfile(scandir+"/xyztmp.pdf"):
            os.unlink(scandir+"/xyztmp.pdf")
    print (scandir,datei)
#****************************************************
def xxscan_ocr (self,scandir,datei):
    """Create ``ocr_<datei>.pdf``: the scan ``<datei>.pdf`` in *scandir* with
    its abbyyocr text appended as extra pages.

    Returns an empty tuple if the scan, or afterwards the OCR text file,
    does not exist.  The intermediate files are removed at the end.
    """
    base = scandir + "/" + datei
    if not os.path.isfile(base + ".pdf"):
        return()
    os.system("cd "+scandir +"; cp " + datei + ".pdf " + datei + "_1.pdf; " +
              "abbyyocr -adt -rl German -ido -if " + datei + "_1.pdf -tet UTF8 -of " + datei + "_2.txt; ")
    textfile = base + "_2.txt"
    if not os.path.isfile(textfile):
        return()
    text = open(textfile).read()
    # Transliterate the German special characters.
    for special,plain in (("ä","ae"),("ö","oe"),("ü","ue"),
                          ("Ä","Ae"),("Ö","Oe"),("Ü","Ue"),("ß","ss")):
        text = text.replace(special,plain)
    text = text.replace("\n","\n"*1000)
    open(textfile,"w").write(text)
    os.system("cd "+scandir +"; a2ps -B -1 -l 99999999 -o " + datei + "_2.ps " + datei + "_2.txt; " +
              "ps2pdf " + datei + "_2.ps " + datei + "_2.pdf; " +
              "pdftk " + datei + "_1.pdf " + datei + "_2.pdf cat output ocr_" + datei + ".pdf")
    print ("pdf file with ocr text appended ...")
    for suffix in ("_1.pdf","_2.pdf","_2.txt","_2.ps"):
        os.unlink(base + suffix)
#****************************************************
def xxocr (self,pars):
    """Put an OCR text layer behind every page of the PDF file ``pars[0]``.

    pars[1]  optional abbyyocr options (default " -adt -rl German ").

    The file is split into pages with pdftk.  A page whose extracted text
    already contains the "* * * OCR SCAN * * *" marker is taken as it is.
    Every other page gets the abbyyocr text, rendered by a2ps / ps2pdf, as
    a background.  If at least one page was processed, the pages are joined
    again into ``pars[0]`` and the first 30 characters of the streams found
    in the original file are appended after a "STREAMS:" line.

    Returns an empty tuple if ``pars[0]`` is not a file.
    """
    datei = pars[0]
    if not os.path.isfile(datei):
        return()
    abbyy_par = pars[1] if len(pars) > 1 else " -adt -rl German "
    # latin-1 without newline translation maps every byte to exactly one
    # character, so binary PDF content survives reading and writing.
    with open(datei,encoding="latin-1",newline="") as source:
        text0 = source.read()
    streams = ""
    while True:
        m = re.search(r"^(.*?\s)stream\s*(.*?)\s+endstream(.*)$",text0,re.DOTALL)
        if not m:
            break
        # NOTE(review): this tests the length of the remaining text, not of
        # the stream itself (group 2) -- confirm which one is intended.
        if len(m.group(3)) > 20:
            streams = streams + m.group(2)[0:30] + "\n"
        text0 = m.group(3)
    zaehler = "000001"
    ocr_complete = True
    del_datei = []
    while True:
        datei1 = "__xx_yy_zz__" + zaehler
        os.system("pdftk " + datei + " cat " + zaehler + " output " + datei1 + ".pdf 2>&1")
        if not os.path.isfile(datei1+".pdf"):
            break                              # no such page: all pages done
        # "-" sends the text to stdout; without it pdftotext writes a .txt
        # file and the marker below could never be found.
        text = os.popen("pdftotext " + datei1 + ".pdf - 2>&1").read()
        if "ocr" in datei:
            text = ""
        if len(glob.glob("./no_pdftotext*")) > 0:
            text = ""
        if re.search(r"\* *\* *\* *O *C *R *S *C *A *N *\* *\* *\*",text):
            print ("take page " + str(int(zaehler)))
            del_datei.append(datei1+".pdf")
        else:
            print ("abbyyocr on page " + str(int(zaehler)))
            if not os.path.isfile(datei1 + "_watermark.txt"):
                os.system("abbyyocr " + abbyy_par + " -ido -if " + datei1 + ".pdf -tet UTF8 -of " + datei1 + "_watermark.txt")
            with open(datei1+"_watermark.txt") as ocr_result:
                text = "\n* * * OCR SCAN * * *"+"\n" + ocr_result.read() + "\n"
            for special,plain in (("ä","ae"),("ö","oe"),("ü","ue"),
                                  ("Ä","Ae"),("Ö","Oe"),("Ü","Ue"),("ß","ss")):
                text = text.replace(special,plain)
            text = text.replace("\n","\n"*10000)
            with open(datei1+"_watermark.txt","w") as watermark:
                watermark.write(text)
            os.system("a2ps -B -1 -l 9999999 --borders=no -o " + datei1 + "_watermark.ps " + datei1 + "_watermark.txt")
            os.system("ps2pdf " + datei1 + "_watermark.ps " + datei1 + "_watermark.pdf")
            os.system("pdftk " + datei1 + ".pdf background " + datei1 + "_watermark.pdf output " + datei1 + "_ocr.pdf")
            os.unlink(datei1 + "_watermark.txt")
            os.unlink(datei1 + "_watermark.ps")
            os.unlink(datei1 + "_watermark.pdf")
            os.unlink(datei1 + ".pdf")
            ocr_complete = False
            del_datei.append(datei1+"_ocr.pdf")
        zaehler = "%06u" % (int(zaehler) + 1)
    if not ocr_complete:
        os.system("pdftk " + " ".join(del_datei) + " cat output " + datei)
        # Mark the original streams, so that the added ones can be deleted again.
        with open(datei,encoding="latin-1",newline="") as joined:
            text = joined.read()
        if not streams == "":
            streams = "STREAMS:" + "\n" + streams
        with open(datei,"w",encoding="latin-1",newline="") as target:
            target.write(text+streams)
    for datei1 in del_datei:
        os.unlink(datei1)
#****************************************************
def ocr_clear (self,pars):
    """Remove streams from the PDF file ``pars[0]`` that are not listed in
    its "STREAMS:" trailer.

    The trailer is the text after "STREAMS:" following the ``%%EOF`` mark:
    one line per stream to keep, holding the first 30 characters of that
    stream.  The content of every other stream of at least 20 characters is
    emptied.  The result is rewritten by pdftk into ``pars[0]``.

    Returns an empty tuple if ``pars[0]`` is not a file or has no trailer.
    """
    datei = pars[0]
    if not os.path.isfile(datei):
        return()
    # latin-1 without newline translation maps every byte to exactly one
    # character; the default encoding fails on or alters binary PDF content.
    with open(datei,encoding="latin-1",newline="") as source:
        text0 = source.read()
    m = re.search(r"^(.*%%EOF.*?)STREAMS:(.*)$",text0,re.DOTALL)
    if not m:
        return()
    text0 = m.group(1)
    streams = m.group(2).split("\n")
    parts = []
    while True:
        m = re.search(r"^(.*?\s)(stream\s*)(.*?)(\s+endstream)(.*)$",text0,re.DOTALL)
        if not m:
            parts.append(text0)
            break
        parts.append(m.group(1) + m.group(2))
        if m.group(3)[0:30] in streams or len(m.group(3)) < 20:
            parts.append(m.group(3))           # listed or tiny stream: keep the content
        parts.append(m.group(4))
        text0 = m.group(5)
    with open("__xx__yy__zz__ww__.pdf","w",encoding="latin-1",newline="") as tmp:
        tmp.write("".join(parts))
    os.system("pdftk __xx__yy__zz__ww__.pdf cat output " + datei)
    os.unlink("__xx__yy__zz__ww__.pdf")
#****************************************************
def ocr_sync8 (self,pars):
    """Run ocr_sync() with the ``ocr8`` flag set and ``abbyy`` switched off."""
    self.ocr8,self.abbyy = True,False
    self.ocr_sync(pars)
#****************************************************
def ocr_sync (self,pars):
"""Keep a companion text file ``<name>.ocr`` for the PDF files below the directory ``pars[0]``.

Sub-directories are processed by recursive calls.  A directory whose
``.gitignore`` contains "__no_ocr__" is skipped.  Every ``.ocr`` file ends
with a trailer line "ID: <md5 of the PDF text>, PARAMETERS: <how the text
was produced>".  A PDF whose ID and parameters match an existing ``.ocr``
file is not processed again; that file is renamed to the PDF's name if
necessary.  ``.ocr`` files without a PDF are removed at the end and PDF
files with identical ID are reported as "doublette".
"""
import hashlib
import base64
# self.ocr8 is only set by ocr_sync8(); default to False otherwise.
try:
self.ocr8
except:
self.ocr8 = False
# md5 id -> list of PDF files with that id (for the duplicate report at the end)
pdffiles = {}
# Detect an ABBYY command line OCR once per instance by the text of its help output.
if not 'abbyy' in vars(self):
# print os.popen("abbyyocr --help 2>&1").read()
self.abbyy = re.search(r"FineReader Engine",os.popen("abbyyocr11 --help 2>&1").read())
if not self.abbyy:
self.abbyy = re.search(r"ABBYY CLI OCR 11 for Linux",os.popen("abbyyocr11 --help 2>&1").read())
dir = pars[0]
if os.path.isfile(dir + "/.gitignore"):
print("wwww")
text4 = open(dir+"/.gitignore").read()
if "__no_ocr__" in text4:
return()
# OCR options: first line of <dir>/ocr.par if present.
if os.path.isfile(dir+"/ocr.par"):
ocr_par = re.sub(r"\n","",open(dir+"/ocr.par").readline(),99)
else:
ocr_par = "-adt -rl German -aeate " # -ascm (abbyyocr 8)
# NOTE(review): the next line replaces the ABBYY options assigned just above by the
# tesseract ones; whether it also overrides ocr.par depends on its lost indentation -- confirm.
ocr_par = self.TESSERACT + " " # tesseract
# Pass 1: recurse into sub-directories and index the existing .ocr files by the ID of
# their trailer: id -> [file name, parameters]
files = {}
for item in os.listdir(dir):
if os.path.isdir(dir+"/"+item):
self.ocr_sync([dir+"/"+item])
else:
file = dir + "/" + item
# if "_orig" in file or "orig_" in file or "_ORIG" in file or "ORIG_" in file:
# try:
# os.unlink(file)
# except:
# pass
m = re.search(r"^(.*)\.ocr$",file)
if m:
m1 = None
try:
m1 = re.search(r"ID\: +([A-Za-z0-9\/\.\+-]+), PARAMETERS.?\: +(.*)",open(file).read())
except Exception as e:
print(item,e)
if m1:
files[m1.group(1)] = [file,m1.group(2).strip()]
# Pass 2: create or refresh the .ocr file of every PDF.
for item in os.listdir(dir):
file = dir + "/" + item
# if not re.search(r"^[a-zA-Z0-9\.\–\_]+$",file):
# continue
print (file)
m = re.search(r"^(.*)\.pdf$",file)
# Paths containing other characters than these are skipped.
if not re.search(r"^[\:\\\/a-zA-Z0-9\.\_-]+$",file):
continue
if "NOT_VALID" in file:
continue
if m:
froot = m.group(1)
# NOTE(review): if open() fails, only the error is printed; text7 is then unbound
# or still holds the content of the previous file.
try:
text7 = open(froot+".pdf",errors="replace").read()
except Exception as e:
print(file,e)
# Looks for a version number 1.0 .. 1.8 within the first 20 characters of the file.
ispdf7 = re.search(r"1\.[012345678]",text7[0:20])
# ID of the PDF: base64 form of the md5 digest of its decoded text, without the
# b'...' wrapper of str() and without the "==" padding.
md5new = hashlib.md5(text7.encode(encoding="utf-8")).digest()
md5new = str(base64.b64encode(md5new))
md5new = re.sub(r"^(b\'|)(.*)\=\=\'?$","\\2",md5new)
if md5new in pdffiles:
pdffiles[md5new].append(froot+".pdf")
else:
pdffiles[md5new] = [froot+".pdf"]
# An .ocr file with this ID exists: reuse it if its parameters are acceptable.
if md5new in files:
print (files[md5new])
print (file)
print (md5new)
print (ocr_par)
print (files[md5new][1])
if files[md5new][1] in (ocr_par.strip(),"pdftotext","ps2ascii"):
if not (froot + ".ocr") == files[md5new][0]:
if os.path.isfile(files[md5new][0]):
os.rename(files[md5new][0],froot+".ocr")
continue
print (file)
# Choose how the first version of the .ocr file is produced; ocr_par1 records the method.
if "_ocr" in froot:
os.system("cp " + froot + ".pdf " + froot + ".ocr")
ocr_par1 = "ocr"
elif len(glob.glob(dir+"/no_pdftotext*")) > 0:
os.system("cp " + froot + ".pdf " + froot + ".ocr")
ocr_par1 = "ocr"
elif len(glob.glob(dir+"/no_ocr*")) > 0:
print("continue")
continue
elif ispdf7:
os.system("pdftotext -layout "+froot+".pdf; mv " + froot + ".txt " + froot + ".ocr")
ocr_par1 = "pdftotext"
else:
os.system("cp " + froot + ".pdf " + froot + ".ocr")
ocr_par1 = "direct"
# os.system("ps2ascii "+froot+".pdf > "+froot+".ocr")
# ocr_par1 = "ps2ascii"
print (ocr_par1)
# ocr_par1 = "ocr"
# os.system("cp " + froot + ".pdf " + froot + ".ocr")
try:
ocr_text = open(froot+".ocr",errors="replace").read()
except:
ocr_text = ""
# print(ocr_text)
# print(ocr_par1)
# print(123)
# NOTE(review): the leading "0 == 0 or" makes this condition always true, so the
# OCR branch below runs for every PDF that reaches this point.
if 0 == 0 or not ocr_text == "" and ((not re.search(r"[A-Za-z].*[A-Za-z].*[A-Za-z]",ocr_text,re.DOTALL)
and not ocr_par1 == "direct") or "PDF Splitter" in ocr_text or ocr_par1 == "ocr"):
# print(1111)
# if 0 == 0:
# print(887766)
if self.abbyy:
print (" ---> OCR")
# A prepared <name>.ocm file takes precedence over running the OCR.
if os.path.isfile(froot+".ocm"):
os.system("cp " + froot + ".ocm " + froot + ".ocr")
else:
# os.system("abbyyocr " + ocr_par + " -if "+froot+".pdf -rkl -of "+froot+".ocr")
if self.ocr8:
ocr_par2 = re.sub(r"-aeate","-ascm",ocr_par)
ocr_par2 = re.sub(r"-adt ","",ocr_par2)
os.system("abbyyocr " + ocr_par2 + " -if "+froot+
".pdf -rkl -tpb -tet UTF8 -of "+froot+".ocr")
else:
os.system("abbyyocr11 " + ocr_par + " -if "+froot+
".pdf -f TextUnicodeDefaults -tel -tpb -tet UTF8 -rkl -trl -of "+froot+".ocr")
ocr_par1 = ocr_par
print("->",ocr_par1)
else:
# print(456)
# tesseract path: render the pages as 300 dpi JPEG files, OCR each into a PDF and
# collect the pdftotext output of these PDFs in text7.
ocr_par1 = ocr_par
os.system("pdftoppm -jpeg -r 300 " + froot + ".pdf t_e_s_s_e_r_a_c_t")
single_files = glob.glob("t_e_s_s_e_r_a_c_t*jpg")
single_files.sort()
text7 = ""
for single_file in single_files:
# A file "eng.eng" in the current directory selects English instead of German.
if os.path.isfile("eng.eng"):
print("English")
os.system("tesseract " + single_file + " " + single_file + " --oem 0 -l eng --psm 11 pdf")
ocr_par1 = re.sub(r"deu","eng",self.TESSERACT)
else:
os.system("tesseract " + single_file + " " + single_file + " --oem 0 -l deu --psm 11 pdf")
ocr_par1 = self.TESSERACT
os.system("pdftotext -layout " + single_file + ".pdf")
# if not text7 == "":
# text7 = text7 + ""
os.unlink(single_file)
os.unlink(single_file+".pdf")
try:
text7 = text7 + open(single_file+".txt").read()
os.unlink(single_file+".txt")
except:
pass
open(froot+".ocr","w").write(text7)
# os.remove(froot+".ocr")
# return()
# NOTE(review): o is reset to "" after the if/else, so the "8" suffix appears unused -- confirm.
if self.ocr8:
o = "8"
else:
o = ""
o = ""
# Append the trailer that pass 1 of a later run reads back.
open(froot+".ocr","a").write("\n\n\nID: "+md5new+", PARAMETERS" + o + ": " + ocr_par1 + "\n")
# Pass 3: remove .ocr files whose PDF no longer exists.
for item in os.listdir(dir):
file = dir + "/" + item
m = re.search(r"^(.*)\.ocr$",file)
if m:
froot = m.group(1)
if not os.path.isfile(froot+".pdf"):
os.remove(file)
# Report PDF files of this directory that share the same ID.
for md5key in pdffiles:
if len(pdffiles[md5key]) > 1:
print("doublette: --------------------------")
for file in pdffiles[md5key]:
print("doublette: " + file)
print("")
#****************************************************
def beep (self,intervals):
    """Print "beep", then ring the terminal bell once per entry of *intervals*.

    Each entry is the pause in milliseconds to wait before the bell is rung.
    """
    print ("beep")
    bell_command = "echo '\a'"
    for pause_ms in intervals:
        time.sleep(pause_ms*0.001)
        os.system(bell_command)
#****************************************************
def sms (self,pars):
    """Initialise the GSM modem on /dev/ttyUSB0 and enter the SIM PIN.

    Only the modem initialisation and the PIN entry are implemented; the
    modem's answers are printed.  ``pars`` is not used.  Requires pyserial.
    """
    import serial
    phone = serial.Serial("/dev/ttyUSB0",460800,timeout=5)
    try:
        # pyserial accepts bytes only under Python 3.
        # phone.write("ATE0\r")
        # phone.write("AT S7=45 S0=0 L1 V1 X0 E1 Q0\r")
        phone.write(b"ATE0X0V1S7=45S0=0Q0\r")
        print (phone.readline())
        print (phone.readline())
        time.sleep(0.5)
        # phone.write("at+chup")
        # phone.readline()
        # time.sleep(0.5)
        # phone.write("at+CPMS=\"SM\"")
        # time.sleep(0.5)
        # NOTE(review): hard-coded SIM PIN; consider reading it from pars or a config file.
        # The trailing "\r" terminates the command; without it the modem
        # keeps waiting for the end of the line.
        phone.write(b"at+cpin=\"2133\"\r")
        print (phone.readline())
        print (phone.readline())
        time.sleep(0.5)
    finally:
        phone.close()
    # Remaining steps of the Perl predecessor, not ported yet:
    # $o = _mc($modem,$cr,"at+CPMS=\"SM\"");
    # print $o;
    # last if ($o !~ /ERROR/);
    # print _mc($modem,$cr,"at+cpin=\"2121\"");
    # }
    ## print _mc($modem,$cr,"at+cmee=1");
    # print _mc($modem,$cr,"at+csca=\"01770610000\"");
    ## print _mc($modem,$cr,"at+cmgf=?");
    # print _mc($modem,$cr,"at+cmgf=1");
    #
    # foreach $o (split(/,/,$nr)) {
    # print _mc($modem,$cr,"at+cmgs=\"$o\"");
    # print _mc($modem,"",$text);
    # $modem->atsend(chr(26));
    # print $text . "\n";
    # $modem->atsend(chr(27));
    # sleep 5;
    # print _mc($modem,$cr,"at+chup");
    # }
    # print _mc($modem,$cr,"atz");
    #
    # return(1);
#****************************************************
def md_pptx(self,pars):
"""Convert the markdown file ``pars[0]`` into the PowerPoint file ``pars[1]``.

If a third parameter is given, the presentation is based on 'template.pptx',
otherwise on the python-pptx default.  Headers (``#`` style or underlined
with ``=`` / ``-``) create title slides.  Images, tables, code blocks,
lists and plain text are put on content slides.  Any exception is reported
by printing its message.
"""
from pptx import Presentation
from pptx.util import Inches, Pt
# from pptx.dml.color import RGBColor
# from pptx.enum.dml import MSO_THEME_COLOR
from pptx.enum.text import PP_ALIGN
try:
# Adds a title slide for the current header line.  Besides its parameters it reads
# i, l, h_level, md_text, prs and title_slide_layout, which are not defined inside
# it (presumably taken from the enclosing function -- confirm).
def textbox(t1,t2,t3):
t_slide = prs.slides.add_slide(title_slide_layout)
t = t_slide.placeholders[0]
st = t_slide.placeholders[1]
# Remove the whitespace-only lines that follow the header.
# NOTE(review): md_text[i+1] is evaluated before the bounds test i < len(md_text) - 1.
j = True
while j:
if md_text[i+1].isspace() and i < len(md_text) - 1:
del md_text[i+1]
else:
j = False
if h_level == 1:
t1 = l.strip('\n')
if l.startswith('#'):
t1 = t1[h_level:]
t.text = t1
t_slide.placeholders[0].text_frame.paragraphs[0].font.size = Pt(44)
# t_slide.placeholders[0].text_frame.paragraphs[0].font.color.rgb = RGBColor(5,65,150)
latest_t = t1
pht = t1
elif h_level == 2:
t2 = l.strip('\n')
if l.startswith('#'):
t2 = t2[h_level:]
t.text = t1
t_slide.placeholders[0].text_frame.paragraphs[0].font.size = Pt(44)
# t_slide.placeholders[0].text_frame.paragraphs[0].font.color.rgb = RGBColor(5,65,150)
st.text = t2
latest_t = t2
pht = t2
elif h_level == 3:
t3 = l[h_level:].strip('\n')
t.text = t2
st.text = t3
latest_t = t3
pht = t3
return (t1,t2,t3,pht,latest_t)
# Adds a content slide titled with pht and returns its text frame, the first
# paragraph of that frame and the slide.
def new_slide():
l_slide = prs.slides.add_slide(line_slide_layout)
l_slide.placeholders[0].text = pht
tf = l_slide.shapes[1].text_frame
tf.word_wrap = True
p = tf.paragraphs[0]
return(tf,p,l_slide)
# Splits lc at the first inline markup (bold+italic, bold, italic, code, link), formats
# that part as a new run of paragraph p and repeats with the rest of the text.
def emphase(lc,p,r):
str_cont = True
while str_cont:
l_content = re.findall(r'\_\_\_.*?\_\_\_|\_\_.*?\_\_|\_.*?\_|\*\*\*.*?\*\*\*|\*\*.*?\*\*|\*.*?\*|\`.*?\`|\[.*?\]\(.*?\)', lc)
if l_content: # inline bold / italic / code / link conditions
l_part = lc.partition(l_content[0])
r.text = l_part[0]
lp2 = l_part[2]
r = p.add_run()
r.text = l_part[1]
if r.text.startswith('***') or r.text.startswith('___'):
r.font.bold = True
r.font.italic = True
r.text = r.text[3:-3]
elif r.text.startswith('**') or r.text.startswith('__'):
r.font.bold = True
r.text = r.text[2:-2]
elif r.text.startswith('*') or r.text.startswith('_'):
r.font.italic = True
r.text = r.text[1:-1]
elif r.text.startswith('`'):
r.font.name = 'Consolas'
r.font.size = Pt(15)
r.text = r.text[1:-1]
elif r.text.startswith('['):
r.text = r.text.split('(')[0][1:-1]
hlink = r.hyperlink
hlink.address = l_part[1].split('(')[1][0:-1]
lp2 =re.sub('\(.*?\)','', lp2)
r = p.add_run()
r.text = lp2
lc = r.text
else:
str_cont = False
if len(pars) < 2:
print ('\nmd_pptx: : missing parameter')
return
md_file = pars[0]
with open(md_file) as f:
md_text = f.readlines()
# The extra empty line lets the loop below look one line ahead at the last real line.
md_text.append('\n')
md_text = [l.expandtabs(4) for l in md_text]
# NOTE(review): sf is assigned but not used in this method; the template name is fixed.
if len(pars) == 3:
sf = pars[2]
prs = Presentation('template.pptx')
else:
prs = Presentation()
title_slide_layout = prs.slide_layouts[0]
line_slide_layout = prs.slide_layouts[1]
# Parser state
t1 = ''
t2 = ''
t3 = ''
md_list = False
md_code = False
new_head = False
# md_tab: 0 = no table, 1 = header row expected, 2 = alignment row expected, 3 = data rows
md_tab = 0
t_coln = 0
t_cola = ''
t_rown = 1
t_cell = []
latest_t = ''
h_level = 0
p_level = 0
header = False
md_break = False
for i, l in enumerate(md_text):
if i < len(md_text) - 1 and (header or l.startswith('#') or re.match('^\=+\n',md_text[i+1]) or re.match('^-+\n',md_text[i+1])):
header = True
if l[0] == '#' and l[1] != '#' or re.match('^\=+\n',md_text[i+1]): # headers
if md_text[i+1].startswith('='):
md_text[i+1] = '\n'
h_level = 1
t1,t2,t3,pht,latest_t = textbox(t1,t2,t3)
new_head = True
if md_text[i+1].startswith('#') or re.match('^\=+\n|^-+\n',md_text[i+2]):
pass
else:
tf,p,l_slide = new_slide()
elif l[0:2] == '##' and l[2] != '#' or re.match('^-+\n',md_text[i+1]):
if md_text[i+1].startswith('-'):
md_text[i+1] = '\n'
h_level = 2
t1,t2,t3,pht,latest_t = textbox(t1,t2,t3)
new_head = True
if md_text[i+1].startswith('#') or re.match('^\=+\n|^-+\n',md_text[i+2]):
pass
else:
tf,p,l_slide = new_slide()
elif l[0:3] == '###' and l[3] != '#':
h_level = 3
t1,t2,t3,pht,latest_t = textbox(t1,t2,t3)
new_head = True
if md_text[i+1].startswith('#') or re.match('^\=+\n|^-+\n',md_text[i+2]):
pass
else:
tf,p,l_slide = new_slide()
elif not l.isspace() and l.startswith('![',0) and not md_list: # image
s = l.split(']')
m = re.search(r'.*\(.*?\)', s[1])
if m:
img_path = m.group(0)[1:-1]
# Move the body placeholder out of the visible slide area.
l_slide.shapes[1].left = -10000000
top = Inches(1.7)
left = Inches(0.6)
height = Inches(4.9)
pic = l_slide.shapes.add_picture(img_path, left, top, height = height)
# Text box below the picture, filled with the text between "![" and "]".
top = Inches(6.7)
width = Inches(8)
height = Inches(0.5)
txBox = l_slide.shapes.add_textbox(left, top, width, height)
tf = txBox.text_frame
m = re.search(r'.*\[.*?\]', l)
if m:
tf.text = m.group(0)[2:-1]
p = tf.paragraphs[0]
p.font.italic = True
elif not l.isspace() and md_tab == 1 and not md_list: # table
# NOTE(review): under Python 3 map() returns an iterator, so "t_cell + cell" here and
# below, and the indexing of t_cola, would raise TypeError -- confirm the target version.
t_coln = l.count('|') + 1
cell = map(str.strip, l.split('|'))
t_cell = t_cell + cell
md_tab = 2
elif md_tab == 2:
# Alignment row: translate the ":---", "---:" and ":---:" markers into PP_ALIGN values.
# NOTE(review): i, the index of the enclosing enumerate loop, is reused as counter here.
t_cola = map(str.strip, l.split('|'))
i = 0
while i < t_coln:
if t_cola[i].startswith(':') and not t_cola[i].endswith(':'):
t_cola[i] = PP_ALIGN.LEFT
elif t_cola[i].endswith(':') and not t_cola[i].startswith(':'):
t_cola[i] = PP_ALIGN.RIGHT
elif t_cola[i].endswith(':') and t_cola[i].startswith(':'):
t_cola[i] = PP_ALIGN.CENTER
else:
t_cola[i] = None
i = i + 1
md_tab = 3
elif md_tab == 3 and l.count('|') == t_coln - 1:
t_rown = t_rown + 1
cell = map(str.strip, l.split('|'))
t_cell = t_cell + cell
# The next line has a different number of "|": the table is complete, create it.
if md_text[i+1].count('|') != t_coln - 1:
l_slide.shapes[1].left = -10000000
top = Inches(1.7)
left = Inches(0.6)
height = Inches(0.31 * t_rown)
width = Inches(8.8)
tab = l_slide.shapes.add_table(t_rown, t_coln, left, top, width, height).table
# t_cell holds all cells row by row; i % t_coln is the column of cell i.
i = 0
row = 0
while i < len(t_cell):
tab.cell(row,i%t_coln).text = t_cell[i]
tab.cell(row,i%t_coln).text_frame.paragraphs[0].alignment = t_cola[i%t_coln]
if t_cell[i] != '':
p = tab.cell(row,i%t_coln).text_frame.paragraphs[0]
r = tab.cell(row,i%t_coln).text_frame.paragraphs[0].runs[0]
emphase(t_cell[i],p,r)
i= i + 1
if i%t_coln == 0:
row = row + 1
# Reset the table state.
md_tab = 0
t_coln = 0
t_cola = ''
t_rown = 1
t_cell = []
# A line of seven or more "#" starts a further slide with the latest title.
elif not l.isspace() and re.match('^#{7,}\n', l): # add slide
l_slide = prs.slides.add_slide(line_slide_layout)
l_slide.shapes[0].text = latest_t
tf = l_slide.shapes[1].text_frame
tf.word_wrap = True
p = tf.paragraphs[0]
elif (md_text[i-1].isspace() or new_head) and l[:4].isspace() and not md_list or md_code: # code block
new_head = False
r = p.add_run()
r.font.name = 'Consolas'
r.font.size = Pt(15)
l = l.rstrip()
if not md_text[i+1].isspace():
l = l + '\n'
r.text = l[2:]
md_code = True
if md_text[i+1].isspace() or not md_text[i+1][:4].isspace():
md_code = False
elif not l.isspace(): # list
# p_level 2, 3 and 4 are the list nesting levels set further below; the list marker
# ("-", "+" or "*") is cut off the line.
if p_level == 4:
p.level = 4
if re.match(' *?[-|\+|\*] ', l):
s = re.split('-|\+|\*', l, 1)
l = s[1]
elif p_level == 3:
p.level = 3
if re.match(' *?[-|\+|\*] ', l):
s = re.split('-|\+|\*', l, 1)
l = s[1]
elif p_level == 2:
p.level = 2
if re.match(' *?[-|\+|\*] ', l):
s = re.split('-|\+|\*', l, 1)
l = s[1]
if l.endswith(' \n'):
p_level = 5
# elif p_level == 1:
# p.level = 1
# s = re.split('^[0-9]+\.', l, 1)
# l = s[1]
elif p_level == 0:
p.level = 0
r = p.add_run()
if not l.endswith(' \n'): # line break condition
l = l.rstrip() + ' '
md_break = False
elif p.level > 0:
md_break = True
l = l.rstrip('\n')
l = re.sub(' +', ' ', l)
r.text = l
lc = l
emphase(lc,p,r)
# Look-ahead: decide from the following line(s) how the next line has to be treated.
if md_text[i+1].isspace() and not md_text[i].isspace():
if not md_text[i].startswith('!') and not re.match('^#{7,}\n', l):
p = tf.add_paragraph()
if not md_list:
p = tf.add_paragraph()
# Two following lines with the same non-zero number of "|": a table starts with the next line.
if md_text[i+1].count('|') != 0 and md_text[i+1].count('|') == md_text[i+2].count('|') and md_tab == 0 and re.match('^(?!\s)', md_text[i+1]):
md_tab = 1
# elif re.match(' {0,3}?^[0-9]+\. ', md_text[i+1]):
# if not new_head and not md_text[i].isspace():
# p = tf.add_paragraph()
# p_level = 1
# md_list = True
# The indentation of the next list item selects the nesting level.
if re.match(' {0,3}?[-|\+|\*] ', md_text[i+1]):
if not new_head and not md_text[i].isspace():
p = tf.add_paragraph()
p_level = 2
md_list = True
elif re.match(' {4,7}?[-|\+|\*] ', md_text[i+1]) and md_list:
if not new_head and not md_text[i].isspace():
p = tf.add_paragraph()
p_level = 3
elif re.match(' {8,11}?[-|\+|\*] ', md_text[i+1]) and md_list:
if not new_head and not md_text[i].isspace():
p = tf.add_paragraph()
p_level = 4
if p_level == 5 and not md_text[i+1].isspace() and md_break:
p = tf.add_paragraph()
p.level = 5
if not md_text[i+1].isspace() and not md_text[i+1].startswith('![',0) and not md_text[i+1][:4].isspace():
new_head = False
# A blank line followed by a line that is not a list item ends the list.
if md_list and md_text[i].isspace():
s = md_text[i+1].lstrip()
if re.match('^(?!-|\+|\*).*', s) or md_text[i+1].isspace():
# if re.match('^(?!-|\+|\*|^[0-9]+\.).*', s) or md_text[i+1].isspace():
md_list = False
p_level = 0
p = tf.add_paragraph()
elif i < len(md_text) - 1 and (md_text[i+1].startswith('#') or re.match('^=+\n',md_text[i+2]) or re.match('^-+\n',md_text[i+2])):
header = True
prs.save(pars[1])
except Exception as inst:
print ('\nmd_pptx: ' + str(inst))
#****************************************************
def docx_ins(self,pars):
    """
    This module enables manipulations in an existing docx file via a command line.

    prerequisite:
    -------------
    pip install python-docx

    Placeholder:
    ------------
    A text string can be inserted at a placeholder within an arbitrary text passage of a docx file.
    The placeholder can be entered in a markdown file, which is converted then to a docx file,
    or can be entered in a docx file.
    It can also be an already existing text in the file. There can be any number of placeholders.
    The syntax of a placeholder is {<placeholder text>}. Formatting the placeholder, e.g. bold,
    affects the insertion.

    Command:
    --------
    yc docx_ins <docx file> <placeholder text> "<text string>"
    to insert <text string> at the placeholder.
    Before the first manipulation of <docx file> it is saved as <docx file>_save

    Parameters: pars[0] docx file, pars[1] placeholder text, pars[2] text string.
    Only the first occurrence of the placeholder is replaced.  Errors are
    reported by printing a message.
    """
    from docx import Document

    def as_text(value):
        # Python 3 passes command line parameters as str already; only byte
        # strings have to be decoded.
        if isinstance(value,bytes):
            return value.decode(sys.getfilesystemencoding())
        return value

    try:
        if len(pars) < 3:
            print ('\ninsert_docx: missing parameter')
            return
        w_file = pars[0]
        document = Document(w_file)
        if not os.path.isfile(w_file + '_save'):
            document.save(w_file + '_save')
        placeholder = '{' + as_text(pars[1]) + '}'
        new_value = as_text(pars[2])
        # If the placeholder is edited in the docx file, more than one run object may be
        # created. In this case, the split string has to be gathered in only one run object.
        for p in document.paragraphs:
            more_runs = False          # True while inside a "{" that is not closed yet
            for i, r in enumerate(p.runs):
                if more_runs:
                    closing = r.text.find('}')
                    if closing != -1 and '{' not in r.text[0:closing]:
                        more_runs = False
                    # Pull the text gathered so far into the current run.
                    r.text = p.runs[i-1].text + r.text
                    p.runs[i-1].clear()
                if placeholder in r.text:
                    r.text = r.text.replace(placeholder,new_value,1)
                    document.save(w_file)
                    return
                opening = r.text.rfind('{')
                if opening != -1 and '}' not in r.text[opening:]:
                    more_runs = True
    except Exception as inst:
        first_arg = inst.args[0] if inst.args else ''
        if 'Package not found at' in str(first_arg):
            print ('\ninsert_docx: nonexistent docx file')
        elif first_arg == 13:          # errno 13: permission denied, e.g. file open elsewhere
            print ('\ninsert_docx: open docx file')
        else:
            print ('\ninsert_docx: ' + str(inst))
#****************************************************
def yt (self,pars):
    """Search YouTube with youtube-dl and append the hits to the file ``pars[-1]``.

    All parameters but the last are the search terms.  youtube-dl prints
    title and video id on alternating lines; for every pair a line
    "# youtube-dl <id> # <title>" is appended below a header naming the
    search terms.

    Raises IndexError when ``pars`` is empty.
    """
    import subprocess

    ytxx = pars[-1]
    # An argument list avoids the shell, so search terms with quotes, blanks
    # or other shell metacharacters are passed on literally.
    command = ["youtube-dl","-e","--get-id","ytsearchall:" + "+".join(pars[:-1])]
    print(" ".join(command))
    try:
        output = subprocess.check_output(command,universal_newlines=True)
    except subprocess.CalledProcessError as err:
        output = err.output or ""              # keep what was found before the failure
    except OSError as err:
        print("youtube-dl could not be started: " + str(err))
        output = ""
    sys.stdout.write(output)

    line1 = ""
    text = ""
    for line in output.split("\n"):
        if line1 == "":
            line1 = line                        # title line, the id follows
        else:
            text = text + "# youtube-dl " + line + " # " + line1 + "\n"
            line1 = ""
    with open(ytxx,"a") as target:
        target.write("\n#=========================\n# " +
                     " ".join(pars[:-1]) +
                     "\n#=========================\n\n" + text)
    # os.system("joe " + ytxx)
#****************************************************
if __name__ == "__main__":
    print (sys.argv)
    # The first argument names the Util method to run; the remaining
    # arguments are handed to it as its parameter list.
    command = sys.argv[1] if len(sys.argv) > 1 else ""
    method = None if command.startswith("_") else getattr(Util,command,None)
    if not callable(method):
        sys.exit("usage: " + sys.argv[0] + " <command> [parameter ...]")
    method(Util(),sys.argv[2:])