|
| 1 | +import pytesseract |
| 2 | +import cv2 |
| 3 | +import sys |
| 4 | +import math |
| 5 | +import numpy as np |
| 6 | +from datetime import datetime,timedelta |
| 7 | +import os |
| 8 | +from PIL import Image |
| 9 | +from os import listdir |
| 10 | +from os.path import isfile, join |
| 11 | +import re |
| 12 | +import pandas as pd |
| 13 | + |
| 14 | + |
# Directory prefix prepended to the temporary image file names that
# ocr_ticker reads and deletes (e.g. base_dir + 'tickimg.jpg').
base_dir ="/mnt/"
# base_dir =""  # use this value instead to run locally (paths relative to the CWD)
| 17 | + |
def _invert_and_upscale(image):
    """Return the bitwise inverse of *image*, enlarged 2x with cubic interpolation."""
    return cv2.resize(cv2.bitwise_not(image), None, fx=2, fy=2,
                      interpolation=cv2.INTER_CUBIC)


def _recognise(image, config):
    """Run Tesseract on *image* and return ``(text, confidence)``.

    ``confidence`` is a pandas Series holding the mean ``conf`` value per
    ``block_num``; rows whose ``conf`` is -1 are dropped first (presumably
    Tesseract's marker for rows that are not recognised words).
    """
    # NOTE(review): the confidence pass runs with Tesseract's defaults, not
    # with `config` (language/engine) -- kept as-is; confirm this is intended.
    data = pytesseract.image_to_data(image, output_type='data.frame')
    data = data[data.conf != -1]
    confidence = data.groupby(['block_num'])['conf'].mean()
    text = pytesseract.image_to_string(image, config=config)
    return text, confidence


def _pick_candidate(text, con, text1, con1):
    """Choose between two OCR results using text presence and block-1 confidence.

    A candidate without confidence data loses to one that has text; when
    both have data, the one with the higher mean confidence for block 1
    wins and ties go to the first candidate.
    """
    if con.empty and con1.empty:
        # No confidence at all: take the second text only if the first is empty.
        if text1 != '' and text == '':
            return text1, con1
        return text, con
    if con1.empty and text != '':
        return text, con
    if con.empty and text1 != '':
        return text1, con1
    # Series.get avoids the KeyError the plain con[1] lookup raised whenever
    # block 1 had no scored words (or one table was empty); a missing score
    # ranks below every real confidence value.
    if con1.get(1, -1.0) > con.get(1, -1.0):
        return text1, con1
    return text, con


def ocr(file, lang, option, d):
    """OCR an image with Tesseract and return ``(text, confidence)``.

    Parameters:
        file:   path of the image to read when ``option == 1``; otherwise it
                is used directly as the image (assumed to be an already
                loaded OpenCV image -- TODO confirm against callers).
        lang:   Tesseract language code, passed via ``-l``.
        option: 1 to load ``file`` from disk with ``cv2.imread``.
        d:      1 for the quick path without denoising (used for tickers);
                any other value denoises the image and tries two
                pre-processing variants, returning the better one.

    Returns:
        A tuple ``(text, con)`` where ``con`` is a pandas Series of mean word
        confidence indexed by ``block_num``.
    """
    # '--oem 1' selects the LSTM OCR engine.
    config = '-l ' + lang + ' --oem 1 --psm 3'
    thresh = 127

    # cv2.imread returns None for an unreadable path; the first cv2 call
    # below then raises, which callers are expected to handle.
    im = cv2.imread(file, cv2.IMREAD_COLOR) if option == 1 else file

    if d == 1:
        # Without denoising (ticker only): invert, upscale, binarise, then
        # flip the binary image back to its original polarity.
        binary = cv2.threshold(_invert_and_upscale(im), thresh, 255,
                               cv2.THRESH_BINARY)[1]
        restored = cv2.threshold(binary, 0, 255, cv2.THRESH_BINARY_INV)[1]
        return _recognise(restored, config)

    # Both variants start from the same denoised, inverted, upscaled image,
    # so the (slow) denoising runs once instead of once per variant.
    denoised = cv2.fastNlMeansDenoisingColored(im, None, 20, 10, 7, 21)
    denoised = cv2.fastNlMeansDenoising(denoised, None, 10, 7, 21)
    prepared = _invert_and_upscale(denoised)

    # Variant 1: denoised, inverted polarity.
    inverted = cv2.threshold(prepared, thresh, 255, cv2.THRESH_BINARY)[1]
    text, con = _recognise(inverted, config)

    # Variant 2: denoised, original polarity.
    # NOTE(review): the source's indentation was lost; this assumes the
    # `lang != 'ben'` guard covers BOTH threshold steps, i.e. Bengali input
    # is recognised without binarisation -- confirm against the repository.
    if lang != 'ben':
        plain = cv2.threshold(inverted, 0, 255, cv2.THRESH_BINARY_INV)[1]
    else:
        plain = prepared
    text1, con1 = _recognise(plain, config)

    return _pick_candidate(text, con, text1, con1)
| 90 | + |
| 91 | + |
| 92 | +# Write to output file |
| 93 | + |
def _format_timestamp(moment):
    """Render *moment* as ``YYYYMMDDHHMMSS.mmm``, rounded to the nearest millisecond.

    The value is formatted from the datetime fields directly. Going through
    a float of this magnitude cannot represent milliseconds exactly, and
    parsing ``str(moment)`` loses the fraction when microseconds are zero.
    """
    # Adding half a millisecond before truncating rounds to nearest and lets
    # datetime arithmetic carry into seconds/minutes/... when needed.
    rounded = moment + timedelta(microseconds=500)
    return '%s.%03d' % (rounded.strftime('%Y%m%d%H%M%S'),
                        rounded.microsecond // 1000)


def _looks_truncated(word, latin_max, other_max):
    """Return True if *word* is short enough to be treated as a cut-off fragment.

    Words containing an ASCII letter or digit use ``latin_max``; all other
    words (e.g. Hindi/Bengali script) use ``other_max``.
    """
    limit = latin_max if re.search(r'[A-Za-z0-9]', word) else other_max
    return len(word) <= limit


def writefile(op, boxes, no, ms, base, text, lang):
    """Append one ``TIC2`` ticker record to the open output file *op*.

    The record has the form
    ``start|end|TIC2|NNNNNN|AAA BBB CCC DDD|text`` followed by a newline.

    Parameters:
        op:    writable text file object.
        boxes: four numbers; the record stores ``boxes[0]``, ``boxes[2]``,
               ``|boxes[1]-boxes[0]|`` and ``|boxes[3]-boxes[2]|``
               (presumably x, y, width, height of the ticker region --
               verify against the caller).
        no:    integer record/frame number, zero-padded to six digits.
        ms:    offset in milliseconds added to *base* to get the start time.
        base:  ``datetime`` the offset is relative to.
        text:  recognised ticker text.
        lang:  unused here; kept so existing callers keep working.

    The end time is fixed at 2200 ms after the start time.
    """
    start = base + timedelta(milliseconds=ms)
    end = start + timedelta(milliseconds=2200)

    # Modify ticker text for continuity: a scrolling ticker usually shows a
    # clipped word at either edge, so short leading/trailing words are dropped.
    words = text.split()
    if len(words) > 2:
        if _looks_truncated(words[0], 5, 8):
            words = words[1:]
        if _looks_truncated(words[-1], 4, 8):
            words = words[:-1]
        text = ' '.join(words)

    geometry = '%03d %03d %03d %03d' % (int(boxes[0]), int(boxes[2]),
                                        abs(boxes[1] - boxes[0]),
                                        abs(boxes[3] - boxes[2]))
    record = '|'.join([_format_timestamp(start), _format_timestamp(end),
                       'TIC2', '%06d' % no, geometry,
                       text.replace('\n', ' ').replace('\r', ' ')])
    op.write(record + '\n')
| 120 | + |
| 121 | + |
| 122 | +## fetch_output(file,boxes,frame_no,timestamp,start_time_utc,lang) |
| 123 | + |
def ocr_ticker(op, boxes, no, ts, base, lang):
    """OCR the current ticker image and append the result to *op*.

    Reads ``base_dir + 'tickimg.jpg'``; if that fails or yields only
    whitespace, falls back to ``base_dir + 'backup.jpg'``. Non-blank text is
    written with ``writefile(op, boxes, no, ts, base, text, lang)``. Both
    temporary images are then deleted.

    This is best-effort: any error from OCR or writing makes the function
    return silently, leaving the image files in place. Nothing is returned.
    """
    primary = base_dir + 'tickimg.jpg'
    backup = base_dir + 'backup.jpg'

    try:
        try:
            text, _ = ocr(primary, lang, 1, 1)
        except Exception:
            # Treat an unreadable primary image like a blank result.
            text = ''
        if not text.strip():
            # Execute backup if tickimg is blank or raised.
            text, _ = ocr(backup, lang, 1, 1)
        # Whitespace-only output counts as blank for both images.
        if text.strip():
            writefile(op, boxes, no, ts, base, text, lang)
    except Exception:
        # Deliberately swallowed: a frame that cannot be processed is skipped.
        return

    # Clean-up sits outside the try block so a failed delete can never
    # re-trigger the fallback and write the same record twice; each file is
    # removed independently so one missing file does not strand the other.
    for path in (primary, backup):
        try:
            os.remove(path)
        except OSError:
            pass
0 commit comments