Author: Wei Ye, Shaowei Wu
This is my first PyTorch coding post. It is based on my group mate Wei Ye's code in CSCI 8581.
In this notebook, I will show step by step how to combine an autoencoder and a few classifiers into a multi-task model in PyTorch. The dataset comes from CSCI 8581.
There are some references for further reading, but I will try to explain the details on 'one' page.
import pandas as pd
import numpy as np
# import data using pandas
data=pd.read_csv('./data/dataset.csv')
# display the dataset
data
Unnamed: 0 | index | ingested | ztf_id | zvm_id | AGN | BL Her | Beta Lyr | Cepheid | Cepheid type-II | ... | roms | significance | skew | smallkurt | stetson_j | stetson_k | sw | welch_i | wmean | wstd | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 102066 | 102066 | 1.0 | 1.071930e+13 | ZTFbkgj8gih | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.834601 | 10.323892 | 0.446629 | 8.569163 | 75.072039 | 0.730721 | 0.442902 | 126.778630 | 17.280580 | 0.026293 |
1 | 62756 | 62756 | 1.0 | 1.048741e+13 | ZTF1ujcg1f7 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 1.103918 | 13.238870 | -0.125668 | 4.635338 | 3.555367 | 0.822637 | 0.972141 | 12.673600 | 20.784238 | 0.213611 |
2 | 124834 | 124834 | 1.0 | 1.085322e+13 | ZTFhzmdxr9g | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 1.628610 | 42.274978 | 5.902958 | 57.228428 | -13.140847 | 0.760559 | 0.958370 | -9.869388 | 20.086235 | 0.216376 |
3 | 90928 | 90928 | 1.0 | 1.029763e+13 | ZTFce4njbny | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 3.236169 | 125.629450 | -3.545247 | 400.901273 | -20.403730 | 0.869648 | 0.953871 | -30.344550 | 18.132819 | 0.133568 |
4 | 81757 | 81757 | 1.0 | 1.053811e+13 | ZTFe3frd9i1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 2.147679 | 48.261472 | 9.276425 | 114.822709 | 11.968448 | 0.828596 | 0.960417 | 24.957636 | 15.564013 | 0.038485 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
69995 | 74220 | 74220 | 1.0 | 1.083360e+13 | ZTF1cl3m3r2 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 4.779705 | 446.147022 | 145.024383 | 2348.694825 | -74.190796 | 0.852355 | 0.890039 | -359.797128 | 17.902267 | 0.151464 |
69996 | 19084 | 19084 | 1.0 | 1.064659e+13 | ZTF8am400ku | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 10.116485 | 350.176987 | 850.973460 | 36210.035792 | 49.310596 | 0.887842 | 0.715181 | 156.768820 | 16.120440 | 0.144459 |
69997 | 46782 | 46782 | 1.0 | 1.068337e+13 | ZTF328c072c | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 2.742242 | 48.505500 | -9.957315 | 239.277269 | -41.692685 | 0.867872 | 0.917511 | -117.915088 | 17.547884 | 0.074043 |
69998 | 65224 | 65224 | 1.0 | 1.068345e+13 | ZTFvij8xkm0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 2.677773 | 55.382964 | -4.305035 | 184.067394 | 3.395322 | 0.869434 | 0.945792 | -10.068047 | 18.232474 | 0.121420 |
69999 | 72515 | 72515 | 1.0 | 1.068907e+13 | ZTFxs6130x2 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 1.589231 | 29.093659 | 1.089665 | 34.701304 | 9.442207 | 0.810905 | 0.989025 | 20.078433 | 14.808053 | 0.019480 |
70000 rows × 134 columns
# NAME of feature columns
FEATS_COL = ['period', 'significance', 'n', 'median', 'wmean', 'wstd', 'chi2red', 'roms', 'norm_peak_to_peak_amp',
'norm_excess_var', 'median_abs_dev', 'iqr', 'f60', 'f70', 'f80' , 'f90', 'skew', 'smallkurt',
'inv_vonneumannratio', 'welch_i', 'stetson_j', 'stetson_k', 'ad', 'sw', 'f1_power', 'f1_bic','f1_amp',
'f1_phi0', 'f1_relamp1', 'f1_relphi1', 'f1_relamp2', 'f1_relphi2', 'f1_relamp3', 'f1_relphi3', 'f1_relamp4',
'f1_relphi5', 'n_ztf_alerts', 'mean_ztf_alert_braai',
'AllWISE__w1mpro', 'AllWISE__w1sigmpro', 'AllWISE__w2mpro', 'AllWISE__w2sigmpro', 'AllWISE__w3mpro',
'AllWISE__w3sigmpro', 'AllWISE__w4mpro','AllWISE__w4sigmpro', 'AllWISE__ph_qual',
'Gaia_DR2__phot_g_mean_mag', 'Gaia_DR2__phot_bp_mean_mag', 'Gaia_DR2__phot_rp_mean_mag', 'Gaia_DR2__parallax',
'Gaia_DR2__parallax_error', 'Gaia_DR2__pmra', 'Gaia_DR2__pmra_error', 'Gaia_DR2__pmdec', 'Gaia_DR2__pmdec_error',
'Gaia_DR2__astrometric_excess_noise', 'Gaia_DR2__phot_bp_rp_excess_factor',
'PS1_DR1__gMeanPSFMag', 'PS1_DR1__gMeanPSFMagErr', 'PS1_DR1__rMeanPSFMag', 'PS1_DR1__rMeanPSFMagErr',
'PS1_DR1__iMeanPSFMag', 'PS1_DR1__iMeanPSFMagErr', 'PS1_DR1__zMeanPSFMag', 'PS1_DR1__zMeanPSFMagErr',
'PS1_DR1__yMeanPSFMag', 'PS1_DR1__yMeanPSFMagErr', 'PS1_DR1__qualityFlag']
# NAME of label columns
LABELS_COL = ['variable', 'periodic', 'long timescale', 'irregular', 'eclipsing', 'EA', 'EB', 'EW', 'flaring', 'bogus',
'pulsator', 'Delta Scu', 'Cepheid', 'RR Lyrae', 'LPV', 'Mira', 'SRV', 'binary star', 'W Uma', 'Beta Lyr',
'RS CVn', 'AGN', 'YSO']
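A quick count of the two lists; these numbers reappear later as the autoencoder input size and the number of classifiers:
print(len(FEATS_COL), len(LABELS_COL))   # 69 feature columns, 23 label columns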
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
# there is one string-typed feature column, AllWISE__ph_qual; encode it as integers
label_encoder = LabelEncoder()
data['AllWISE__ph_qual'] = label_encoder.fit_transform(data['AllWISE__ph_qual'])
raw_feat_data=data[FEATS_COL]
raw_feat_data
period | significance | n | median | wmean | wstd | chi2red | roms | norm_peak_to_peak_amp | norm_excess_var | ... | PS1_DR1__gMeanPSFMagErr | PS1_DR1__rMeanPSFMag | PS1_DR1__rMeanPSFMagErr | PS1_DR1__iMeanPSFMag | PS1_DR1__iMeanPSFMagErr | PS1_DR1__zMeanPSFMag | PS1_DR1__zMeanPSFMagErr | PS1_DR1__yMeanPSFMag | PS1_DR1__yMeanPSFMagErr | PS1_DR1__qualityFlag | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 638.203392 | 10.323892 | 263.0 | 17.2810 | 17.280580 | 0.026293 | 1.302158 | 0.834601 | 0.005928 | 5.959733e-07 | ... | 0.008173 | 16.746901 | 0.002109 | 16.545099 | 0.002660 | 16.490801 | 0.003791 | 16.441299 | 0.009441 | 60.0 |
1 | 0.024785 | 13.238870 | 64.0 | 20.7980 | 20.784238 | 0.213611 | 1.750986 | 1.103918 | 0.013147 | 4.627087e-05 | ... | 0.000000 | 20.716000 | 0.070433 | 19.948601 | 0.027574 | 19.569901 | 0.025853 | 19.288401 | 0.018201 | 52.0 |
2 | 0.143803 | 42.274978 | 52.0 | 20.0900 | 20.086235 | 0.216376 | 4.482082 | 1.628610 | 0.019138 | 1.371048e-04 | ... | 0.216329 | 20.288799 | 0.021331 | 19.140900 | 0.027460 | 18.440201 | 0.021492 | 17.916401 | 0.016800 | 60.0 |
3 | 0.413181 | 125.629450 | 68.0 | 18.1750 | 18.132819 | 0.133568 | 13.397498 | 3.236169 | 0.012696 | 5.158521e-05 | ... | 0.026070 | 18.137800 | 0.025632 | 18.086201 | 0.008809 | 18.142000 | 0.014741 | 18.162600 | 0.029057 | 52.0 |
4 | 0.408992 | 48.261472 | 80.0 | 15.5615 | 15.564013 | 0.038485 | 6.665935 | 2.147679 | 0.004172 | 5.185334e-06 | ... | 0.010956 | 15.573000 | 0.002684 | 15.526600 | 0.011109 | 15.457800 | 0.010758 | 15.440800 | 0.003390 | 60.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
69995 | 0.147927 | 446.147022 | 142.0 | 17.8795 | 17.902267 | 0.151464 | 32.885707 | 4.779705 | 0.016361 | 8.421307e-05 | ... | 0.036146 | 17.875099 | 0.015237 | 17.474800 | 0.005624 | 17.372900 | 0.021434 | 17.202000 | 0.005745 | 60.0 |
69996 | 0.142266 | 350.176987 | 68.0 | 16.0745 | 16.120440 | 0.144459 | 139.722523 | 10.116485 | 0.013583 | 8.378785e-05 | ... | 0.052543 | 16.104900 | 0.027450 | 15.940100 | 0.016539 | 15.735300 | 0.008238 | 15.792400 | 0.037322 | 60.0 |
69997 | 0.057444 | 48.505500 | 55.0 | 17.5680 | 17.547884 | 0.074043 | 10.050414 | 2.742242 | 0.007125 | 1.586307e-05 | ... | 0.020108 | 17.561300 | 0.019155 | 17.592699 | 0.008392 | 17.648199 | 0.010982 | 17.542500 | 0.020736 | 60.0 |
69998 | 0.051981 | 55.382964 | 55.0 | 18.2570 | 18.232474 | 0.121420 | 9.432388 | 2.677773 | 0.010803 | 4.006329e-05 | ... | 0.029872 | 18.257601 | 0.022021 | 18.247400 | 0.021117 | 18.378099 | 0.008947 | 18.277901 | 0.022746 | 52.0 |
69999 | 0.151073 | 29.093659 | 131.0 | 14.8070 | 14.808053 | 0.019480 | 3.824048 | 1.589231 | 0.002262 | 1.274570e-06 | ... | 0.002540 | 14.826800 | 0.002436 | 14.753200 | 0.004272 | 14.783300 | 0.004143 | 14.726300 | 0.003507 | 60.0 |
70000 rows × 69 columns
raw_label_data=data[LABELS_COL]
raw_label_data
variable | periodic | long timescale | irregular | eclipsing | EA | EB | EW | flaring | bogus | ... | RR Lyrae | LPV | Mira | SRV | binary star | W Uma | Beta Lyr | RS CVn | AGN | YSO | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
4 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
69995 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
69996 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
69997 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
69998 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
69999 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
70000 rows × 23 columns
Before we move on, let's scale the feature data. There are a few scalers to choose from; StandardScaler is adopted here since it feels the most "natural": in the real world, many physical quantities are roughly normally distributed.
StandardScaler applies a linear transformation to each column: $$x'=\frac{x-\mu}{\sigma}$$ where $\mu$ and $\sigma$ are the mean and standard deviation of the original data.
# now rescale the feature columns
from sklearn.preprocessing import StandardScaler
feat_data = raw_feat_data.values
df_feats_values_scaled = StandardScaler().fit_transform(feat_data)
df_feats_scaled = pd.DataFrame(df_feats_values_scaled, columns=raw_feat_data.columns)
df_feats_scaled # this is the data frame we need
period | significance | n | median | wmean | wstd | chi2red | roms | norm_peak_to_peak_amp | norm_excess_var | ... | PS1_DR1__gMeanPSFMagErr | PS1_DR1__rMeanPSFMag | PS1_DR1__rMeanPSFMagErr | PS1_DR1__iMeanPSFMag | PS1_DR1__iMeanPSFMagErr | PS1_DR1__zMeanPSFMag | PS1_DR1__zMeanPSFMagErr | PS1_DR1__yMeanPSFMag | PS1_DR1__yMeanPSFMagErr | PS1_DR1__qualityFlag | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.795541 | -0.411116 | 1.056842 | -0.080996 | -0.070772 | -0.729371 | -0.204919 | -0.618402 | -0.556206 | -0.090013 | ... | -0.505132 | -0.171156 | -0.711477 | -0.108664 | -0.679382 | -0.035408 | -0.589965 | 0.087552 | -0.428986 | 0.202861 |
1 | -0.169144 | -0.404608 | -0.609176 | 1.717669 | 1.745705 | 0.605940 | -0.203427 | -0.567115 | -0.058580 | -0.067296 | ... | -0.771984 | 1.529579 | 2.328723 | 1.432994 | 0.796992 | 1.331472 | 0.413797 | 1.111349 | -0.121882 | -0.577069 |
2 | -0.168778 | -0.339781 | -0.709640 | 1.355583 | 1.383824 | 0.625651 | -0.194344 | -0.467194 | 0.354400 | -0.022121 | ... | 6.291251 | 1.346526 | 0.143841 | 1.067136 | 0.790237 | 0.829973 | 0.215383 | 0.617988 | -0.170997 | 0.202861 |
3 | -0.167949 | -0.153682 | -0.575688 | 0.376213 | 0.371072 | 0.035347 | -0.164693 | -0.161056 | -0.089659 | -0.064653 | ... | 0.079213 | 0.424836 | 0.335221 | 0.589397 | -0.315000 | 0.697595 | -0.091769 | 0.706519 | 0.258703 | -0.577069 |
4 | -0.167962 | -0.326416 | -0.475225 | -0.960383 | -0.960730 | -0.642459 | -0.187080 | -0.368344 | -0.677252 | -0.087730 | ... | -0.414266 | -0.674166 | -0.685891 | -0.570005 | -0.178704 | -0.493979 | -0.272985 | -0.272220 | -0.641119 | 0.202861 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
69995 | -0.168765 | 0.561914 | 0.043836 | 0.225089 | 0.251542 | 0.162925 | -0.099879 | 0.132890 | 0.162960 | -0.048426 | ... | 0.408199 | 0.312270 | -0.127323 | 0.312456 | -0.503739 | 0.356175 | 0.212744 | 0.361095 | -0.558559 | 0.202861 |
69996 | -0.168783 | 0.347649 | -0.575688 | -0.698025 | -0.672249 | 0.112984 | 0.255438 | 1.149209 | -0.028490 | -0.048638 | ... | 0.943568 | -0.446250 | 0.416117 | -0.382705 | 0.143071 | -0.370791 | -0.387639 | -0.145787 | 0.548454 | 0.202861 |
69997 | -0.169044 | -0.325871 | -0.684524 | 0.065781 | 0.067812 | -0.388981 | -0.175824 | -0.255117 | -0.473654 | -0.082420 | ... | -0.115449 | 0.177809 | 0.047016 | 0.365860 | -0.339711 | 0.478386 | -0.262794 | 0.483536 | -0.033011 | 0.202861 |
69998 | -0.169061 | -0.310516 | -0.684524 | 0.418150 | 0.422739 | -0.051247 | -0.177880 | -0.267395 | -0.220118 | -0.070384 | ... | 0.203350 | 0.476170 | 0.174543 | 0.662414 | 0.414358 | 0.802405 | -0.355381 | 0.747981 | 0.037455 | -0.577069 |
69999 | -0.168756 | -0.369210 | -0.048255 | -1.346250 | -1.352658 | -0.777935 | -0.196532 | -0.474693 | -0.808870 | -0.089675 | ... | -0.689052 | -0.993908 | -0.696926 | -0.920326 | -0.583857 | -0.793404 | -0.573950 | -0.529149 | -0.637018 | 0.202861 |
70000 rows × 69 columns
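As a quick sanity check, each scaled column should now have a mean close to 0 and a standard deviation close to 1 (only approximately, since pandas' std uses a slightly different denominator than sklearn):
# scaled columns should have mean ~0 and std ~1
print(df_feats_scaled['period'].mean(), df_feats_scaled['period'].std())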
For the label data, we use LabelEncoder to convert each column to integer class codes.
df_labels = raw_label_data.apply(LabelEncoder().fit_transform)
df_labels # this is the data frame we need
variable | periodic | long timescale | irregular | eclipsing | EA | EB | EW | flaring | bogus | ... | RR Lyrae | LPV | Mira | SRV | binary star | W Uma | Beta Lyr | RS CVn | AGN | YSO | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 47 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 47 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 47 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
69995 | 47 | 4 | 0 | 0 | 4 | 0 | 0 | 5 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 4 | 3 | 0 | 0 | 0 | 0 |
69996 | 47 | 4 | 0 | 0 | 4 | 0 | 0 | 5 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 4 | 3 | 0 | 0 | 0 | 0 |
69997 | 47 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
69998 | 47 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
69999 | 47 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
70000 rows × 23 columns
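To make the encoding concrete: LabelEncoder maps the sorted distinct values of a column to the integers 0..K-1. A toy example with made-up values:
# three distinct values -> codes 0, 1, 2 in sorted order
print(LabelEncoder().fit_transform([0.0, 2.0, 2.0, 5.0, 0.0]))   # [0 1 1 2 0]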
Now use train_test_split to split the features and labels into training and test sets. If you would like a validation set as well, just apply train_test_split twice, as sketched below.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_feats_scaled.values, df_labels.values, test_size=0.5, random_state=8581)
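A minimal sketch of the second split, carving a validation set out of the training portion (the 0.2 fraction is an arbitrary choice and the validation set is not used in the rest of this notebook):
# carve a validation set out of the training portion
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=8581)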
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset
We will use CUDA to accelerate the training. For more up-to-date information, visit https://pytorch.org/docs/stable/notes/cuda.html.
A command like
pip3 install torch==1.8.1+cu111 torchvision==0.9.1+cu111 torchaudio===0.8.1 -f https://download.pytorch.org/whl/torch_stable.html
as shown on the official PyTorch website will install PyTorch together with the CUDA and cuDNN runtime libraries. Note that the command ships its own copies of these dependencies, so PyTorch will not use system installations of CUDA and cuDNN; everything needed is placed inside the pip/conda environment. The NVIDIA driver, however, will not be installed this way, so it must be set up separately. See the link below for an explanation.
https://discuss.pytorch.org/t/install-pytorch-gpu-with-pre-installed-cuda-and-cudnn/70808/2
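Before relying on the GPU it is worth confirming that PyTorch can actually see it; a quick check (device index 0 is assumed to be the card we want):
# confirm that a CUDA-capable device and driver are visible to PyTorch
print(torch.cuda.is_available())   # True if a usable GPU is present
print(torch.version.cuda)          # CUDA runtime version PyTorch was built with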
# I have one beautiful GTX1660 Ti installed.
torch.cuda.get_device_name(0)
'NVIDIA GeForce GTX 1660 Ti'
Now let's build a simple fully connected (FCNN) classifier. We have to be careful about the input and output dimensions. The classifier will receive an $N \times 16$ tensor from the autoencoder, which will be defined later. The output of the classifier is an $N \times C$ tensor, where $C$ is the number of classes for a single label and $N$ is the number of records in the minibatch. For example, if the label "Crypto" had 4 classes (BTC, LTC, DOGE, ETH), then $C$ would be 4 and the classifier would decide which kind of crypto each record belongs to. One may ask why we do not collapse the output into a single column of dimension $N \times 1$. The reason is that we use nn.CrossEntropyLoss as our loss function, which expects raw, unnormalized scores (logits) of shape $N \times C$ and applies LogSoftmax internally; otherwise we would have to process the classifier output with a Softmax ourselves.
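A tiny shape check of what nn.CrossEntropyLoss expects (random numbers, purely for illustration): the input is an $N \times C$ tensor of raw logits and the target is a length-$N$ tensor of class indices.
# N = 5 records, C = 4 classes
logits = torch.randn(5, 4)                     # raw classifier outputs
targets = torch.tensor([0, 3, 1, 1, 2])        # one class index per record
print(nn.CrossEntropyLoss()(logits, targets))  # LogSoftmax + NLLLoss applied internally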
# this is a simple classifier for a single label
class classifier(nn.Module):
    def __init__(self, label_name_):
        super().__init__()
        self.fc1 = nn.Linear(in_features=16, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=64)
        self.fc3 = nn.Linear(in_features=64, out_features=16)
        # one output unit per class of this label
        self.outputfc = nn.Linear(in_features=16, out_features=df_labels[label_name_].nunique())

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = self.outputfc(x)  # raw logits; CrossEntropyLoss applies LogSoftmax itself
        return x
There are 69 feature columns in our dataset, so the input of our autoencoder is $N \times 69$.
The autoencoder has two parts: the encoder compresses the information into an $N \times 16$ tensor, in a way quite similar to PCA, and the decoder reconstructs an $N \times 69$ tensor (dec_data) that is compared with the input to get the reconstruction error. The MLP_0 module below feeds the encoded tensor to every classifier and returns the reconstruction together with the list of classifier outputs.
class MLP_0(nn.Module):
    def __init__(self, model_list: list):
        super().__init__()
        # ==============AUTOENCODER================
        # encoder
        self.enc1 = nn.Linear(in_features=69, out_features=64)
        self.enc2 = nn.Linear(in_features=64, out_features=32)
        self.enc_out = nn.Linear(in_features=32, out_features=16) # output encoded tensor
        # decoder
        self.dec1 = nn.Linear(in_features=16, out_features=32)
        self.dec2 = nn.Linear(in_features=32, out_features=64)
        self.dec_out = nn.Linear(in_features=64, out_features=69) # output decoded data
        # ==========================================
        # keep the classifiers in a plain Python list (see the note on parameters below)
        self.classifiermodels = list(model_list)

    def forward(self, x):
        enc_data = F.relu(self.enc1(x))
        enc_data = F.relu(self.enc2(enc_data))
        enc_data = self.enc_out(enc_data)
        dec_data = F.relu(self.dec1(enc_data))
        dec_data = F.relu(self.dec2(dec_data))
        dec_data = self.dec_out(dec_data)
        # feed the encoded tensor to every classifier
        outlist = [clf(enc_data) for clf in self.classifiermodels]
        return dec_data, outlist
A model_list of per-label classifiers must be passed to MLP_0.
# Model, Optimizer, Loss
model_list=[classifier(label_name_).cuda() for label_name_ in LABELS_COL]
# Initialize the model.
model = MLP_0(model_list).cuda()
# number of parameters in model, from https://discuss.pytorch.org/t/how-do-i-check-the-number-of-parameters-of-a-model/4325/25
sum(p.numel() for p in model.parameters())+sum([sum(p.numel() for p in model_.parameters()) for model_ in model_list])
280550
# Hyper-parameters
epochs = 100
learning_rate = 1e-3
# DataLoader
train_data = TensorDataset(torch.tensor(X_train).float(), torch.tensor(y_train).long()) # features as FP32, targets as int64 for CrossEntropyLoss
train_ldr = torch.utils.data.DataLoader(train_data, batch_size=256, shuffle=False)
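A quick way to confirm the batch shapes before training:
xb, yb = next(iter(train_ldr))
print(xb.shape, yb.shape)   # torch.Size([256, 69]) torch.Size([256, 23])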
Remember that PyTorch does not register the parameters of sub-models stored in a plain Python list (here, the 23 classifier models inside MLP_0), so model.parameters() only covers the autoencoder. We have to hand the optimizer a list of all the parameters to be optimized; if we passed list(model.parameters()) alone, the optimizer would update the autoencoder but never the classifiers.
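As a side note, wrapping the classifiers in nn.ModuleList inside MLP_0 would register their parameters automatically, so model.parameters() would already include them; a sketch of that alternative (not used in this notebook):
# alternative for MLP_0.__init__: sub-module parameters get registered automatically
# self.classifiermodels = nn.ModuleList(model_list)
Here we keep the plain list and collect the classifier parameters by hand: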
classifierparaml = []
for model_element in model.classifiermodels:
    classifierparaml += list(model_element.parameters())
criterion_MSE = nn.MSELoss() # for reconstruction loss
criterion_CE = nn.CrossEntropyLoss() # for classification loss
# optimize the autoencoder parameters and the classifier parameters together
optimizer = torch.optim.Adam(list(model.parameters()) + classifierparaml, lr=learning_rate)
# Training
loss_list = []
alpha = 0.2  # weighting factor for the losses (kept from the original code, not used below)
model.train()
for model_s in model.classifiermodels:
    model_s.train()
for i in range(epochs):
    for (batch_idx, batch) in enumerate(train_ldr):
        #X_train_batch = batch[0] # for CPU
        X_train_batch = batch[0].cuda() # for CUDA
        #y_train_batch = batch[1] # for CPU
        y_train_batch = batch[1].cuda() # for CUDA
        X_reco, outlist_ = model(X_train_batch) # calling the model invokes forward() and returns (dec_data, outlist)
        loss_1 = criterion_MSE(X_reco, X_train_batch) # reconstruction loss
        loss_2 = 0. # classification loss
        for j in range(len(LABELS_COL)):
            loss_2 += criterion_CE(outlist_[j], y_train_batch[:, j]) # one cross-entropy term per label
        loss = loss_1 + loss_2
        # Zero the gradients before running the backward pass.
        optimizer.zero_grad()
        # Backward pass: compute the gradient of the loss with respect to every learnable
        # parameter. The parameters of each Module are stored in tensors with
        # requires_grad=True, so this call computes gradients for all of them.
        loss.backward()
        # Calling step() on the optimizer updates the parameters.
        optimizer.step()
        loss_list.append(loss.item()) # store a plain float rather than the CUDA tensor and its graph
        if batch_idx % 50 == 0:
            print("Epoch: {}, batch: {} Loss1: {:0.4f} Loss2: {:0.4f} Loss: {:0.4f}".format(i, batch_idx, loss_1, loss_2, loss))
# Prediction
preds = []
with torch.no_grad():
    X_reco_final, outlist_final = model(torch.tensor(X_test).float().cuda())
    #print("The prediction result: \n", X_reco_final.int())
# Function for evaluation:
# calculate the accuracy for each label column
from sklearn.metrics import accuracy_score
def get_accuracy(y_true, y_pred):
    acc_list = []
    for i in range(y_true.shape[1]):
        acc_list.append(accuracy_score(y_true[:, i], y_pred[:, i]))
    return np.array(acc_list)
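The helper expects full $N \times 23$ matrices; a sketch of building the prediction matrix from outlist_final and calling it (it gives the same numbers as the list comprehension below):
# stack each classifier's argmax prediction into an (N, 23) matrix
y_pred = torch.stack([out.argmax(dim=1) for out in outlist_final], dim=1).cpu().numpy()
print(get_accuracy(y_test, y_pred))   # one accuracy value per label column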
# per-label accuracy: argmax of each classifier's soft-maxed output vs. the true labels
scorelist = [accuracy_score(y_test[:, i], F.softmax(outlist_final[i], dim=1).argmax(dim=1).cpu())
             for i in range(len(LABELS_COL))]
# Evaluation
print("The mean accuracy: ", np.mean(scorelist))
import matplotlib.pyplot as plt
plt.hist(scorelist, bins=np.linspace(0., 1, 50))
plt.title("distribution of acccuracy")
plt.show()
The mean accuracy: 0.9745180124223601