Confusion Matrix Evaluator

Last updated: February 18th, 2020 (2020-02-18) · Project preview

Confusion Matrix Evaluator

In [1]:
import os
import pandas
import datetime
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix

from analysis import show_value_counts, show_stacked_histogram
from confusion import ConfusionMatrix

# Directory names, relative to the working directory: input files are read
# from DATA and every generated artifact is written below RESULTS.
DATA = "data"
RESULTS = "results"

Parameters

In [2]:
# Name of the input table; it is looked up inside the DATA directory.
file = "Classification.csv"

# Switch for the optional cleaning step (casts every column to str).
clean = True

# When True, results are written to a subdirectory of RESULTS that is named
# after the current timestamp; when False they go straight into RESULTS.
timestamp_output = True

Load data, prepare output directory

In [3]:
# Load the source table, picking the pandas reader from the file extension.
source_path = os.path.join(DATA, file)
# Compare case-insensitively so that e.g. "DATA.CSV" is also recognised.
lowered_name = file.lower()

if lowered_name.endswith("csv"):
    data = pandas.read_csv(source_path)
# str.endswith needs a tuple for multiple suffixes; a list raises TypeError.
elif lowered_name.endswith(("xls", "xlsx")):
    data = pandas.read_excel(source_path)
else:
    # Unknown extension: fall back to the generic delimited-text reader.
    try:
        data = pandas.read_table(source_path)
    except Exception as err:
        # Keep the original cause chained so the real parsing/IO problem
        # stays visible in the traceback.
        raise IOError(f"Cannot read file {os.path.join(DATA, file)} into pandas.DataFrame!") from err

# Prepare output directory
if timestamp_output:
    # NOTE(review): the pattern yields year-day-month ("20-18-02"), which does
    # not sort chronologically — confirm whether "%y-%m-%d" was intended.
    dt = datetime.datetime.now()
    dt = dt.strftime("%y-%d-%m_%H-%M-%S")
    outdir = os.path.join(RESULTS, dt)
else:
    outdir = RESULTS
# makedirs also creates a missing RESULTS parent and is a no-op when the
# directory already exists, so re-running the cell is safe.
os.makedirs(outdir, exist_ok=True)

print(f"OUTPUTDIR: {outdir}")
OUTPUTDIR: results/20-18-02_08-15-36

Data cleaning (optional)

In [4]:
if clean:
    # Cast each column to str, one column at a time, so that all labels are
    # compared as text.
    # NOTE(review): astype(str) presumably turns missing values into the text
    # "nan", which would then show up as its own class — confirm this is wanted.
    for name in data.columns:
        data[name] = data[name].astype(str)

Data statistics

In [5]:
# Values occurring in the individual columns
show_value_counts(data)

# Stacked histogram of the table, saved as a PNG in the output directory
histogram_path = os.path.join(outdir, "stacked_hist.png")
fig = show_stacked_histogram(data)
fig.savefig(histogram_path)
Value counts: Observation
---------------------------
Mitgliedsbescheinigung    73
Vollmacht                 46
Urkunde                   39
Verdienstbescheinigung    20
Ausweis                   17
Meldebescheinigung         4
Name: Observation, dtype: int64

Value counts: Ground Truth
---------------------------
Mitgliedsbescheinigung    64
Vollmacht                 43
Urkunde                   42
Verdienstbescheinigung    24
Ausweis                   13
Meldebescheinigung         9
Studienbescheinigung       4
Name: Ground Truth, dtype: int64

Confusion matrix

In [6]:
# Build the all-vs-all confusion matrix from the loaded table.
C = ConfusionMatrix.get_ava(data)

# Name the CSV after the input file. splitext strips only the final
# extension, so a name such as "my.data.csv" keeps its full stem "my.data"
# (splitting on the first "." would have shortened it to "my").
ava_name = os.path.splitext(file)[0] + "-AVA.csv"
C.save(os.path.join(outdir, ava_name))

# NOTE(review): in the saved output the row sums equal the "Ground Truth"
# value counts although the row axis is labelled "Observation" — verify the
# axis labelling inside ConfusionMatrix.
C
Out[6]:
Ausweis Meldebescheinigung Mitgliedsbescheinigung Studienbescheinigung Urkunde Verdienstbescheinigung Vollmacht
Observation Ausweis 12 0 1 0 0 0 0
Meldebescheinigung 1 4 1 0 0 2 1
Mitgliedsbescheinigung 1 0 56 0 0 3 4
Studienbescheinigung 0 0 2 0 0 0 2
Urkunde 1 0 2 0 39 0 0
Verdienstbescheinigung 1 0 8 0 0 13 2
Vollmacht 1 0 3 0 0 2 37
In [7]:
# Build the one-vs-rest confusion matrix from the loaded table.
C = ConfusionMatrix.get_ovr(data)

# Name the CSV after the input file. splitext strips only the final
# extension, so a name such as "my.data.csv" keeps its full stem "my.data"
# (splitting on the first "." would have shortened it to "my").
ovr_name = os.path.splitext(file)[0] + "-OVR.csv"
C.save(os.path.join(outdir, ovr_name))

# Last expression of the cell: display the matrix.
C
Out[7]:
Ausweis Meldebescheinigung Mitgliedsbescheinigung Studienbescheinigung Urkunde Verdienstbescheinigung Vollmacht
True False True False True False True False True False True False True False
Observation True 12 1 4 5 56 8 0 4 39 3 13 11 37 6
False 5 181 0 190 17 118 0 195 0 157 7 168 9 147
In [ ]:
 
Notebooks AI
Notebooks AI Profile20060