# Machine Learning for Biology

Material for the MLB course.

Last updated: August 19th, 2020

# Dimensionality reduction & visualization

## Digit images

Module imports

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from urllib.request import urlopen
from sklearn.decomposition import PCA
from sklearn import manifold
from time import time


Data import

In [2]:
# Download the digit images and their labels from the course web page.
# The original cell assigned the two URLs but never read them, so `digits`
# and `labels` were undefined on a fresh kernel.
# NOTE(review): both files are assumed to be comma-separated numeric tables
# without a header row -- confirm against the actual CSV files (add
# `skiprows=1` to the calls below if they turn out to have a header line).
url="https://perso.univ-rennes1.fr/valerie.monbet/doc/cours/digits_extrait_images.csv"
with urlopen(url) as response:
    digits = np.loadtxt(response, delimiter=",")

url="https://perso.univ-rennes1.fr/valerie.monbet/doc/cours/digits_extrait_labels.csv"
with urlopen(url) as response:
    labels = np.loadtxt(response, delimiter=",")

# Images and labels are indexed with the same row numbers later on, so they
# must have the same number of rows.
assert labels.shape[0] == digits.shape[0], "images and labels have different lengths"

# Number of classes; presumably the labels are coded 0..K-1 -- TODO confirm.
K = int(np.max(labels)+1)
print("Number of different labels :", K)

X = np.copy(digits)   # untouched copy of the raw data
n = digits.shape[0]   # number of rows of `digits`
d = digits.shape[1]   # number of columns of `digits`

# Divide by 255 (presumably 8-bit pixel intensities, giving values in [0, 1]).
# This is a rescaling, not a standardization: no centering, no unit variance.
Xs = digits/255.

Number of different labels : 5


## Principal component analysis

First, run a PCA with two components and plot the first 500 individuals in the (PC1, PC2) plane, each drawn as its label with one color per label.

In [ ]:


In [4]:
# Project the rescaled data onto its first two principal components.
pca = PCA(n_components=2)
D = pca.fit_transform(Xs)

# Only the first n_ex individuals are drawn.
n_ex = 500

fig, ax = plt.subplots()
# White line through the plotted points: presumably there only to make the
# axes autoscale to the data, since text artists do not affect autoscaling.
ax.plot(D[:n_ex, 0], D[:n_ex, 1], "w")

# Write each individual's label at its (PC1, PC2) position, one color per label.
for k in range(K):
    color_k = {'color': 'C' + str(k)}
    for i in np.nonzero(labels[:n_ex] == k)[0]:
        ax.text(D[i, 0], D[i, 1], int(labels[i]), fontdict=color_k)

ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_title('PCA - individuals')
plt.show()

In [ ]:


In [ ]: