/*
 * Webpack JSONP chunk 2756, registering module 3164.
 *
 * The module wraps a compiled Vue render function for the static
 * "Optical Character Recognition" documentation page (PyTesseract and PyOCR
 * samples, pre-highlighted as Python) and exposes the resulting component
 * as the module's default export.
 *
 * Generated build output: prefer fixing the page's source document and
 * rebuilding. The only content change relative to the generated markup is the
 * `lang='fra'` literal in the PyTesseract sample, which previously ended with a
 * typographic quote (U+2019) and therefore displayed invalid Python.
 */
(window.webpackJsonp = window.webpackJsonp || []).push([
  [2756],
  {
    3164: function (__webpack_module__, __webpack_exports__, __webpack_require__) {
      "use strict";

      // NOTE(review): presumably webpack's "mark exports as ES module" helper - confirm against the runtime.
      __webpack_require__.r(__webpack_exports__);

      // NOTE(review): module 31 looks like the vue-loader component normalizer - confirm.
      var normalizer = __webpack_require__(31);

      /**
       * Render function for the page; invoked with the component instance as `this`.
       * @returns {*} the vnode produced for the root "ContentSlotsDistributor" element
       */
      var render = function () {
        var vm = this;
        var fallbackCreate = vm.$createElement;
        var h = vm._self._c || fallbackCreate;

        /** Plain text vnode. */
        var txt = function (value) {
          return vm._v(value);
        };

        /** Highlighted token: <span class="token KIND">value</span>. */
        var tok = function (kind, value) {
          return h("span", { pre: true, attrs: { class: "token " + kind } }, [txt(value)]);
        };
        var kw = function (value) { return tok("keyword", value); };
        var punct = function (value) { return tok("punctuation", value); };
        var op = function (value) { return tok("operator", value); };
        var str = function (value) { return tok("string", value); };
        var num = function (value) { return tok("number", value); };
        var builtin = function (value) { return tok("builtin", value); };
        var comment = function (value) { return tok("comment", value); };

        /** Heading with the "#" permalink anchor in front of the title. */
        var heading = function (tag, id, title) {
          return h(tag, { attrs: { id: id } }, [
            h("a", { staticClass: "header-anchor", attrs: { href: "#" + id } }, [txt("#")]),
            txt(" " + title)
          ]);
        };

        /** Paragraph wrapping the given child vnodes. */
        var para = function (children) {
          return h("p", children);
        };

        /** Inline <code> element. */
        var inlineCode = function (value) {
          return h("code", [txt(value)]);
        };

        /** External "here" link followed by the OutboundLink component. */
        var hereLink = function (href) {
          return h(
            "a",
            { attrs: { href: href, target: "_blank", rel: "noopener noreferrer" } },
            [txt("here"), h("OutboundLink")],
            1
          );
        };

        /** Python code block: div.language-py > pre.language-py > code. */
        var pyBlock = function (children) {
          return h("div", { staticClass: "language-py extra-class" }, [
            h("pre", { pre: true, attrs: { class: "language-py" } }, [h("code", children)])
          ]);
        };

        var pytesseractSample = pyBlock([
          kw("try"), punct(":"), txt("\n "),
          kw("import"), txt(" Image\n"),
          kw("except"), txt(" ImportError"), punct(":"), txt("\n "),
          kw("from"), txt(" PIL "), kw("import"), txt(" Image\n\n"),
          kw("import"), txt(" pytesseract\n\n "),
          comment("#Basic OCR"), txt("\n "),
          kw("print"), punct("("), txt("pytesseract"), punct("."), txt("image_to_string"),
          punct("("), txt("Image"), punct("."), builtin("open"), punct("("),
          str("'test.png'"), punct(")"), punct(")"), punct(")"), txt("\n\n "),
          comment("#In French"), txt("\n "),
          kw("print"), punct("("), txt("pytesseract"), punct("."), txt("image_to_string"),
          punct("("), txt("Image"), punct("."), builtin("open"), punct("("),
          str("'test-european.jpg'"), punct(")"), punct(","), txt(" lang"), op("="),
          // Fixed: the closing quote was U+2019, which left the literal unterminated.
          str("'fra'"),
          punct(")"), punct(")"), txt("\n\n")
        ]);

        var pyocrInitSample = pyBlock([
          kw("from"), txt(" PIL "), kw("import"), txt(" Image\n"),
          kw("import"), txt(" sys\n\n"),
          kw("import"), txt(" pyocr\n"),
          kw("import"), txt(" pyocr"), punct("."), txt("builders\n\ntools "),
          op("="), txt(" pyocr"), punct("."), txt("get_available_tools"),
          punct("("), punct(")"), txt("\n"),
          comment("# The tools are returned in the recommended order of usage"),
          txt("\ntool "), op("="), txt(" tools"), punct("["), num("0"), punct("]"),
          txt("\n\nlangs "), op("="), txt(" tool"), punct("."), txt("get_available_languages"),
          punct("("), punct(")"),
          txt("\nlang "), op("="), txt(" langs"), punct("["), num("0"), punct("]"), txt("\n"),
          comment("# Note that languages are NOT sorted in any way. Please refer"), txt("\n"),
          comment("# to the system locale settings for the default language"), txt("\n"),
          comment("# to use."), txt("\n\n")
        ]);

        var pyocrUsageSample = pyBlock([
          txt("txt "), op("="), txt(" tool"), punct("."), txt("image_to_string"), punct("("),
          txt("\n Image"), punct("."), builtin("open"), punct("("), str("'test.png'"),
          punct(")"), punct(","),
          txt("\n lang"), op("="), txt("lang"), punct(","),
          txt("\n builder"), op("="), txt("pyocr"), punct("."), txt("builders"), punct("."),
          txt("TextBuilder"), punct("("), punct(")"), txt("\n"),
          punct(")"), txt("\n"),
          comment("# txt is a Python string"),

          txt("\n\nword_boxes "), op("="), txt(" tool"), punct("."), txt("image_to_string"),
          punct("("),
          txt("\n Image"), punct("."), builtin("open"), punct("("), str("'test.png'"),
          punct(")"), punct(","),
          txt("\n lang"), op("="), str('"eng"'), punct(","),
          txt("\n builder"), op("="), txt("pyocr"), punct("."), txt("builders"), punct("."),
          txt("WordBoxBuilder"), punct("("), punct(")"), txt("\n"),
          punct(")"), txt("\n"),
          comment("# list of box objects. For each box object:"), txt("\n"),
          comment("# box.content is the word in the box"), txt("\n"),
          comment("# box.position is its position on the page (in pixels)"), txt("\n"),
          comment("#"), txt("\n"),
          comment("# Beware that some OCR tools (Tesseract for instance)"), txt("\n"),
          comment("# may return empty boxes"),

          txt("\n\nline_and_word_boxes "), op("="), txt(" tool"), punct("."),
          txt("image_to_string"), punct("("),
          txt("\n Image"), punct("."), builtin("open"), punct("("), str("'test.png'"),
          punct(")"), punct(","), txt(" lang"), op("="), str('"fra"'), punct(","),
          txt("\n builder"), op("="), txt("pyocr"), punct("."), txt("builders"), punct("."),
          txt("LineBoxBuilder"), punct("("), punct(")"), txt("\n"),
          punct(")"), txt("\n"),
          comment("# list of line objects. For each line object:"), txt("\n"),
          comment("# line.word_boxes is a list of word boxes (the individual words in the line)"),
          txt("\n"),
          comment("# line.content is the whole text of the line"), txt("\n"),
          comment("# line.position is the position of the whole line on the page (in pixels)"),
          txt("\n"),
          comment("#"), txt("\n"),
          comment("# Beware that some OCR tools (Tesseract for instance)"), txt("\n"),
          comment("# may return empty boxes"), txt("\n\n"),

          comment("# Digits - Only Tesseract (not 'libtesseract' yet !)"),
          txt("\ndigits "), op("="), txt(" tool"), punct("."), txt("image_to_string"),
          punct("("),
          txt("\n Image"), punct("."), builtin("open"), punct("("), str("'test-digits.png'"),
          punct(")"), punct(","),
          txt("\n lang"), op("="), txt("lang"), punct(","),
          txt("\n builder"), op("="), txt("pyocr"), punct("."), txt("tesseract"), punct("."),
          txt("DigitBuilder"), punct("("), punct(")"), txt("\n"),
          punct(")"), txt("\n"),
          comment("# digits is a python string"), txt("\n\n")
        ]);

        // The " " text nodes between siblings are part of the generated markup and are kept;
        // note that none follows a code block.
        return h(
          "ContentSlotsDistributor",
          { attrs: { "slot-key": vm.$parent.slotKey } },
          [
            heading("h1", "optical-character-recognition", "Optical Character Recognition"),
            txt(" "),
            para([
              txt("Optical Character Recognition is converting images of text into actual text. In these examples find ways of using OCR in python.")
            ]),
            txt(" "),
            heading("h2", "pytesseract", "PyTesseract"),
            txt(" "),
            para([txt("PyTesseract is an in-development python package for OCR.")]),
            txt(" "),
            para([txt("Using PyTesseract is pretty easy:")]),
            txt(" "),
            pytesseractSample,
            para([
              txt("PyTesseract is open source and can be found "),
              hereLink("https://github.com/madmaze/pytesseract"),
              txt(".")
            ]),
            txt(" "),
            heading("h2", "pyocr", "PyOCR"),
            txt(" "),
            para([
              txt("Another module of some use is "),
              inlineCode("PyOCR"),
              txt(", source code of which is "),
              hereLink("https://github.com/jflesch/pyocr"),
              txt(".")
            ]),
            txt(" "),
            para([
              txt("Also simple to use and has more features than "),
              inlineCode("PyTesseract"),
              txt(".")
            ]),
            txt(" "),
            para([txt("To initialize:")]),
            txt(" "),
            pyocrInitSample,
            para([txt("And some examples of usage:")]),
            txt(" "),
            pyocrUsageSample
          ]
        );
      };

      // Same argument list as the generated call: empty options object, the render
      // function, an empty array, false, then three nulls.
      var component = Object(normalizer.a)({}, render, [], false, null, null, null);

      __webpack_exports__.default = component.exports;
    }
  }
]);