Download the data at the following address: http://geometrica.saclay.inria.fr/team/Fred.Chazal/slides/data_acc.dat, save it as a file named data_acc.dat, and load it using the pickle module:
import numpy as np
import pickle
import gudhi as gd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn import manifold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.cluster import KMeans
f = open("data_acc.dat","rb")
data = pickle.load(f,encoding="latin1")
f.close()
data_A = data[0]
data_B = data[1]
data_C = data[2]
label = data[3]
The walks of three people, A, B and C, have been recorded with the accelerometer sensor of a smartphone carried in their pocket, giving rise to three multivariate time series in $\mathbb{R}^3$: each time series gives the three coordinates of the acceleration of the corresponding person in a coordinate system attached to the sensor (beware that, since the smartphone may have been carried in a different position for each person, these time series cannot be compared coordinate by coordinate). Using a sliding window, each series has been split into a list of 100 time series of 200 consecutive points each, which are now stored in data_A, data_B and data_C.
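As a quick sanity check (assuming, as the description above suggests, that each data set is stored as a numpy array of shape (number of windows) x (window length) x 3), you can print the shapes before going further:
# Each of data_A, data_B, data_C is expected to have shape (100, 200, 3);
# label gathers the class labels of all windows.
print(np.shape(data_A), np.shape(data_B), np.shape(data_C))
print(np.shape(label))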
data_A_sample = data_A[0]
fig = plt.figure()
ax = fig.add_subplot(projection='3d')
ax.plot(data_A_sample[:,0], data_A_sample[:,1], data_A_sample[:,2])
few = 20 # we don't have hours, use less than 100
data_ABC = np.concatenate((data_A[0:few],data_B[0:few],data_C[0:few]),axis=0)
nb_data = np.shape(data_ABC)[0]
# Compute all persistence diagrams
diag_list_dim0 = []
diag_list_dim1 = []
for i in range(nb_data):
    pt_cloud = data_ABC[i,:,:]
    alpha_complex = gd.AlphaComplex(points=pt_cloud)
    st = alpha_complex.create_simplex_tree()
    diag = st.persistence()
    diag_list_dim0.append(st.persistence_intervals_in_dimension(0))
    diag_list_dim1.append(st.persistence_intervals_in_dimension(1))
    if i % 30 == 0:
        print('number of computed diagrams:', i)
# Compute pairwise distance matrices
B0 = np.zeros((nb_data,nb_data))
B1 = np.zeros((nb_data,nb_data))
for i in range(nb_data):
    for j in range(i):
        B0[i,j] = gd.bottleneck_distance(diag_list_dim0[i], diag_list_dim0[j])
        B1[i,j] = gd.bottleneck_distance(diag_list_dim1[i], diag_list_dim1[j])
    if i % 10 == 0:
        print('number of rows computed:', i)
B0 = B0 + B0.transpose()
B1 = B1 + B1.transpose()
pickle.dump(B0, open("B0.dat", "wb"))
pickle.dump(B1, open("B1.dat", "wb"))
#Load pairwise distance matrices
B0 = pickle.load(open("B0.dat", "rb"), encoding="latin1")
B1 = pickle.load(open("B1.dat", "rb"), encoding="latin1")
#plt.imshow(B0, cmap='gray', interpolation='nearest');
plt.imshow(B1, cmap='gray', interpolation='nearest');
## MDS on pairwise distances between diagrams
colors = ("red","green","blue","black","purple","cyan")
label_color = []
for i in range(few):
    label_color.append(colors[0])
for i in range(few):
    label_color.append(colors[1])
for i in range(few):
    label_color.append(colors[2])
mds = manifold.MDS(n_components=2, max_iter=3000, eps=1e-9,
dissimilarity="precomputed", n_jobs=1)
pos0 = mds.fit(B0).embedding_
fig = plt.figure()
ax = fig.add_subplot(121)
ax.scatter(pos0[:,0], pos0[:, 1], marker = 'o', color=label_color)
mds = manifold.MDS(n_components=3, max_iter=3000, eps=1e-9,
dissimilarity="precomputed", n_jobs=1)
pos1 = mds.fit(B1).embedding_
ax = fig.add_subplot(122, projection='3d')
ax.scatter(pos1[:,0], pos1[:, 1], pos1[:,2], marker = 'o', color=label_color)
# B0 and B1 are the pairwise distance matrices between the 0- and 1-dimensional diagrams
# label_color contains the color corresponding to the class of each diagram
mds = manifold.MDS(n_components=3, max_iter=3000, eps=1e-9, dissimilarity="precomputed", n_jobs=1)
pos1 = mds.fit(B1).embedding_
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(pos1[:,0], pos1[:, 1], pos1[:,2], marker = 'o', color=label_color)
def sliding_window_data(x, edim, delay=1):
    """Time-delay embedding of a d-dimensional time series into R^{d*edim}.
    The time series is assumed to be periodic.
    Parameters:
    + x: a list of d lists of the same length L, or a d x L numpy array
    + edim: the number of points used to build the embedding in R^{d*edim}
    + delay: the embedding is given by (x[i], x[i+delay], ..., x[i+(edim-1)*delay]);
      the default value for delay is 1
    """
    ts = np.asarray(x)
    if len(np.shape(ts)) == 1:
        ts = np.reshape(ts, (1, ts.shape[0]))
    ts_d = ts.shape[0]        # dimension d of the time series
    ts_length = ts.shape[1]   # length L of the time series
    output = ts
    for i in range(edim-1):
        # stack the series shifted by (i+1)*delay, with periodic boundary via np.roll
        output = np.concatenate((output, np.roll(ts, -(i+1)*delay, axis=1)), axis=0)
    return output
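For instance (a quick illustrative check, not part of the original code), applying the function to a one-dimensional periodic signal returns an edim x L array whose i-th column is the delayed vector (x[i], x[i+delay], x[i+2*delay]), with indices taken modulo L:
# Toy check of the time-delay embedding on a periodic 1-d signal
tt = np.linspace(0, 2*np.pi, 100, endpoint=False)
emb = sliding_window_data(np.sin(tt), edim=3, delay=2)
print(emb.shape)  # (3, 100): column i is (x[i], x[i+2], x[i+4]), indices mod 100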
edim = 3
delay = 2
diag_list = []
diag_list_TD_dim0 = []
diag_list_TD_dim1 = []
for i in range(nb_data):
    pt_cloud = sliding_window_data(np.transpose(data_ABC[i,:,:]), edim, delay)
    pt_cloud = np.transpose(pt_cloud)
    rips_complex = gd.RipsComplex(points=pt_cloud, max_edge_length=100.0)
    st = rips_complex.create_simplex_tree(max_dimension=2)
    diag_list.append(st.persistence())
    diag_list_TD_dim0.append(st.persistence_intervals_in_dimension(0))
    diag_list_TD_dim1.append(st.persistence_intervals_in_dimension(1))
    if i % 10 == 0:
        print('number of computed diagrams:', i)
pickle.dump(diag_list, open("Diags_accelero.dat", "wb"))
B_TD_0 = np.zeros((nb_data,nb_data))
B_TD_1 = np.zeros((nb_data,nb_data))
for i in range(nb_data):
    for j in range(i):
        B_TD_0[i,j] = gd.bottleneck_distance(diag_list_TD_dim0[i], diag_list_TD_dim0[j])
        B_TD_1[i,j] = gd.bottleneck_distance(diag_list_TD_dim1[i], diag_list_TD_dim1[j])
    if i % 10 == 0:
        print('number of rows computed:', i)
B_TD_0 = B_TD_0 + B_TD_0.transpose()
B_TD_1 = B_TD_1 + B_TD_1.transpose()
pickle.dump(B_TD_0, open("B_TD_0.dat", "wb"))
pickle.dump(B_TD_1, open("B_TD_1.dat", "wb"))
#Load pairwise distance matrices
B_TD_0 = pickle.load(open("B_TD_0.dat", "rb"), encoding="latin1")
B_TD_1 = pickle.load(open("B_TD_1.dat", "rb"), encoding="latin1")
mds = manifold.MDS(n_components=3, max_iter=3000, eps=1e-9,
dissimilarity="precomputed", n_jobs=1)
pos0 = mds.fit(B_TD_0).embedding_
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(pos0[:,0], pos0[:, 1], pos0[:,2], marker = 'o', color=label_color)
mds = manifold.MDS(n_components=3, max_iter=3000, eps=1e-9,
dissimilarity="precomputed", n_jobs=1)
pos1 = mds.fit(B_TD_1).embedding_
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(pos1[:,0], pos1[:, 1], pos1[:,2], marker = 'o', color=label_color)
Landscape construction is currently only available in the C++ version of Gudhi. Here is a simple Python implementation that you can use for this class.
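As a reminder, for a persistence pair $(b,d)$ the associated tent function is $\Lambda_{(b,d)}(t) = \max(0, \min(t-b, d-t))$, and the $k$-th landscape of a diagram is the pointwise $k$-th largest value of these tent functions over all pairs of the selected homology dimension. The $\sqrt{2}$ factors appearing in the implementation below correspond to a rescaled (rotated) parameterization of the same functions.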
def landscapes_approx(diag, p_dim, x_min, x_max, nb_nodes, nb_ld):
    """Compute a discretization of the first nb_ld landscapes of a
    p_dim-dimensional persistence diagram on a regular grid on the
    interval [x_min, x_max]. The output is a nb_ld x nb_nodes numpy
    array.
    + diag: a persistence diagram (in the Gudhi format)
    + p_dim: the homology dimension to consider
    """
    landscape = np.zeros((nb_ld, nb_nodes))
    diag_dim = []
    for pair in diag:  # keep the persistence pairs of homology dimension p_dim
        if pair[0] == p_dim:
            diag_dim.append(pair[1])
    step = (x_max - x_min) / (nb_nodes - 1)
    # Warning: naive and not the most efficient way to proceed!
    for i in range(nb_nodes):
        x = x_min + i * step
        t = x / np.sqrt(2)
        event_list = []
        for pair in diag_dim:
            b = pair[0]
            d = pair[1]
            if b <= t <= d:
                if t >= (d + b) / 2:
                    event_list.append((d - t) * np.sqrt(2))
                else:
                    event_list.append((t - b) * np.sqrt(2))
        event_list.sort(reverse=True)
        event_list = np.asarray(event_list)
        for j in range(nb_ld):
            if j < len(event_list):
                landscape[j, i] = event_list[j]
    return landscape
diag_list=pickle.load(open("Diags_accelero.dat", "rb"))
nb_ld = 5 # number of Landscapes
nb_nodes = 500
length_max = 1.0
gd.plot_persistence_diagram(diag_list[1])
L = landscapes_approx(diag_list[2], 1, 0, length_max, nb_nodes, nb_ld)
plt.plot(L.transpose())
L0_list = []
L1_list = []
L_list = []
for dgm in diag_list:
    L0 = landscapes_approx(dgm, 0, 0, length_max, nb_nodes, nb_ld)
    L0 = L0.ravel()
    L1 = landscapes_approx(dgm, 1, 0, length_max, nb_nodes, nb_ld)
    L1 = L1.ravel()
    L0_list.append(L0)
    L1_list.append(L1)
    L = np.append(L0, L1)
    L_list.append(L)
L_dist0 = np.zeros((nb_data,nb_data))
L_dist1 = np.zeros((nb_data,nb_data))
L_dist = np.zeros((nb_data,nb_data))
for i in range(nb_data):
    for j in range(i):
        L_dist0[i,j] = np.linalg.norm(L0_list[i]-L0_list[j])
        L_dist1[i,j] = np.linalg.norm(L1_list[i]-L1_list[j])
        L_dist[i,j] = np.linalg.norm(L_list[i]-L_list[j])
L_dist0 = L_dist0 + L_dist0.transpose()
L_dist1 = L_dist1 + L_dist1.transpose()
L_dist = L_dist + L_dist.transpose()
plt.imshow(L_dist0, cmap='gray', interpolation='nearest')
plt.imshow(L_dist1, cmap='gray', interpolation='nearest')
plt.imshow(L_dist, cmap='gray', interpolation='nearest')
sublabel=np.concatenate((label[0:few],label[100:100+few],label[200:200+few]))
# Classification with random forests
# It is interesting to compare the results obtained with L0_list, L1_list and L_list
avg = 0
for i in range(20):
    L_train, L_test, label_train, label_test = train_test_split(L1_list, sublabel, test_size=0.2)
    RF = RandomForestClassifier()
    RF.fit(L_train, label_train)
    print(np.mean(RF.predict(L_test) == label_test))
    avg += np.mean(RF.predict(L_test) == label_test)
    #print(confusion_matrix(RF.predict(L_test), label_test))
print("avg pred:", avg/20)
plt.plot(RF.feature_importances_)
# Pretty bad results
kmeans = KMeans(n_clusters=3, init='k-means++',random_state=0).fit(L1_list)
kmeans.labels_
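Since the K-means cluster indices are arbitrary, a label-invariant score such as the adjusted Rand index (an addition here, not used elsewhere in this section) is a quick way to compare the clustering with the true classes:
from sklearn.metrics import adjusted_rand_score
# 1.0 means perfect agreement with the true classes; values near 0 mean essentially random
print(adjusted_rand_score(sublabel, kmeans.labels_))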
The goal of this exercise is to implement the bootstrap algorithm below, from [F. Chazal, B.T. Fasy, F. Lecci, A. Rinaldo, L. Wasserman, Stochastic Convergence of Persistence Landscapes and Silhouettes, Journal of Computational Geometry, 6(2), 140-161, 2015], to compute confidence bands for landscapes. As an example, compute confidence bands for the expected landscapes of each of the 3 classes in the accelerometer data set.
Input: landscapes $\lambda_1,\ldots,\lambda_n$; confidence level $1-\alpha$; number of bootstrap samples $B$
Output: confidence functions $\ell_n, u_n \colon \mathbb{R} \to \mathbb{R}$
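A possible starting point is sketched below: a multiplier-bootstrap band, assuming all landscapes are discretized on the same grid (as produced by landscapes_approx). The function landscape_confidence_band and its arguments are illustrative names introduced here, and the sketch should be checked against the algorithm in the paper.
def landscape_confidence_band(landscapes, alpha=0.05, B=1000):
    """Multiplier-bootstrap confidence band around the mean landscape.
    landscapes: (n, nb_nodes) array, one discretized landscape per row.
    Returns (lower, upper), two arrays of length nb_nodes."""
    lam = np.asarray(landscapes)
    n = lam.shape[0]
    mean_land = lam.mean(axis=0)
    centered = lam - mean_land
    theta = np.zeros(B)
    for b in range(B):
        xi = np.random.normal(size=n)                  # Gaussian multipliers
        theta[b] = np.abs(xi @ centered).max() / np.sqrt(n)
    q = np.quantile(theta, 1 - alpha)                  # bootstrap (1 - alpha) quantile
    return mean_land - q / np.sqrt(n), mean_land + q / np.sqrt(n)

# Example: band for the first dimension-1 landscape of class A (its first `few` windows)
lam_A = np.asarray([L1_list[i][:nb_nodes] for i in range(few)])
low, up = landscape_confidence_band(lam_A, alpha=0.05, B=1000)
plt.plot(lam_A.mean(axis=0))
plt.fill_between(range(nb_nodes), low, up, alpha=0.3)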