Customizable Postprocessing

Last updated: February 27th, 2020 (2020-02-27) · Project preview
In [2]:
!pip install numpy
!pip install pandas
Requirement already satisfied: numpy in /usr/local/lib/python3.6/site-packages (1.18.1)
WARNING: You are using pip version 19.1.1, however version 20.0.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.
Requirement already satisfied: pandas in /usr/local/lib/python3.6/site-packages (1.0.1)
Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.6/site-packages (from pandas) (2019.3)
Requirement already satisfied: python-dateutil>=2.6.1 in /usr/local/lib/python3.6/site-packages (from pandas) (2.8.0)
Requirement already satisfied: numpy>=1.13.3 in /usr/local/lib/python3.6/site-packages (from pandas) (1.18.1)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.6/site-packages (from python-dateutil>=2.6.1->pandas) (1.12.0)
WARNING: You are using pip version 19.1.1, however version 20.0.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.
In [2]:
import json
import numpy as np
import pandas
import datetime

from utils import pprint
from record import RecordEntry, MetaData, DataRecord
In [3]:
# Example: build four RecordEntry objects and print each one right after creation.

# Entry that sets both `ocr_pos` and `txt_pos`.
name = RecordEntry(
    value="Max Mustermann",
    ocr="Max Mosterman",
    ocr_pos=(54, 23, 10, 100),
    txt_pos=(250, 260),
    probability=0.95,
)
pprint(name)

# The remaining entries pass `position` rather than `ocr_pos`. In the printed
# result `ocr_pos` is all-NaN and `position` shows up as an additional field.
# NOTE(review): if `position` was meant to be `ocr_pos`, confirm and rename.
name2 = RecordEntry(
    value="Mandy Mustermann",
    ocr="Mandy Mustermann",
    position=(54, 23, 20, 100),
    probability=0.93,
)
pprint(name2)

street = RecordEntry(
    value="Musterstraße 84",
    ocr="Musterstr. b4",
    position=(67, 138, 11, 90),
    txt_pos=(1000, 1020),
    probability=0.9,
)
pprint(street)

city = RecordEntry(
    value="Musterhausen",
    ocr="Musterhaus",
    position=(288, 417, 9, 110),
    probability=0.99,
)
pprint(city)
┐
├─ value : Max Mustermann
├─ ocr : Max Mosterman
├─ ocr_pos : (54, 23, 10, 100)
├─ txt_pos : (250, 260)
└─ probability : 0.95

┐
├─ value : Mandy Mustermann
├─ ocr : Mandy Mustermann
├─ ocr_pos : (nan, nan, nan, nan)
├─ txt_pos : (nan, nan)
├─ probability : 0.93
└─ position : (54, 23, 20, 100)

┐
├─ value : Musterstraße 84
├─ ocr : Musterstr. b4
├─ ocr_pos : (nan, nan, nan, nan)
├─ txt_pos : (1000, 1020)
├─ probability : 0.9
└─ position : (67, 138, 11, 90)

┐
├─ value : Musterhausen
├─ ocr : Musterhaus
├─ ocr_pos : (nan, nan, nan, nan)
├─ txt_pos : (nan, nan)
├─ probability : 0.99
└─ position : (288, 417, 9, 110)

In [4]:
# Example: nest the record entries into a DataRecord hierarchy, built bottom-up.

# Innermost record: groups the street and city entries.
address = DataRecord(
    street=[street],
    city=[city],
)

# Sender record: both name entries plus the address record.
sender = DataRecord(
    name=[name, name2],
    address=[address],
)

# Meta data describing the document.
doc_meta = MetaData(
    document="my_invoice.png",
    user="MKrause",
    release="1.2.3",
)

# Top-level record for a single document: meta data plus one sender.
single_record = DataRecord(
    meta=doc_meta,
    sender=[sender],
)
In [5]:
# Print the complete nested record (meta data and sender) as a tree.
pprint(single_record)
┐
├─ meta ──┐
│         ├─ user : MKrause
│         ├─ release : 1.2.3
│         └─ document : my_invoice.png
│         
└─ sender ┐
          └─┐
            ├─ name ───┐
            │          ├─┐
            │          │ ├─ value : Max Mustermann
            │          │ ├─ ocr : Max Mosterman
            │          │ ├─ ocr_pos : (54, 23, 10, 100)
            │          │ ├─ txt_pos : (250, 260)
            │          │ └─ probability : 0.95
            │          │ 
            │          └─┐
            │            ├─ value : Mandy Mustermann
            │            ├─ ocr : Mandy Mustermann
            │            ├─ ocr_pos : (nan, nan, nan, nan)
            │            ├─ txt_pos : (nan, nan)
            │            ├─ probability : 0.93
            │            └─ position : (54, 23, 20, 100)
            │            
            │          
            └─ address ┐
                       └─┐
                         ├─ street ┐
                         │         └─┐
                         │           ├─ value : Musterstraße 84
                         │           ├─ ocr : Musterstr. b4
                         │           ├─ ocr_pos : (nan, nan, nan, nan)
                         │           ├─ txt_pos : (1000, 1020)
                         │           ├─ probability : 0.9
                         │           └─ position : (67, 138, 11, 90)
                         │           
                         │         
                         └─ city ──┐
                                   └─┐
                                     ├─ value : Musterhausen
                                     ├─ ocr : Musterhaus
                                     ├─ ocr_pos : (nan, nan, nan, nan)
                                     ├─ txt_pos : (nan, nan)
                                     ├─ probability : 0.99
                                     └─ position : (288, 417, 9, 110)
                                     
                                   
                         
                       
            
          

In [6]:
# Serialize the whole record to a JSON string with the default encoder settings.
# With these defaults NaN values are written as bare `NaN` tokens (visible in
# the output), which strict JSON parsers reject.
json.dumps(single_record)
Out[6]:
'{"meta": {"user": "MKrause", "release": "1.2.3", "document": "my_invoice.png"}, "sender": [{"name": [{"value": "Max Mustermann", "ocr": "Max Mosterman", "ocr_pos": [54, 23, 10, 100], "txt_pos": [250, 260], "probability": 0.95}, {"value": "Mandy Mustermann", "ocr": "Mandy Mustermann", "ocr_pos": [NaN, NaN, NaN, NaN], "txt_pos": [NaN, NaN], "probability": 0.93, "position": [54, 23, 20, 100]}], "address": [{"street": [{"value": "Musterstra\\u00dfe 84", "ocr": "Musterstr. b4", "ocr_pos": [NaN, NaN, NaN, NaN], "txt_pos": [1000, 1020], "probability": 0.9, "position": [67, 138, 11, 90]}], "city": [{"value": "Musterhausen", "ocr": "Musterhaus", "ocr_pos": [NaN, NaN, NaN, NaN], "txt_pos": [NaN, NaN], "probability": 0.99, "position": [288, 417, 9, 110]}]}]}]}'
Notebooks AI
Notebooks AI Profile20060