Linear and logistic regression, the two entry-level machine learning algorithms, are quite easy to understand and, in their vectorized form, provide a good way to practice coding the general machine learning pipeline. Namely,
- Prepping the dataset, e.g. removing outliers, adding features (polynomial multiples of existing features), normalization, feature scaling.
- Implementing the learning algorithm function.
- Calculating the loss through the chosen loss function and hypothesis.
- Optimization algorithm – updating your model's parameters depending on the loss and the ground truth.
- Maintaining a config file for the overall training process.
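Each of these steps maps onto a function implemented below: scalify_min_max and pre_data_prep handle normalization and dataset prep, dataprep creates the train/val/test splits, linear_regression and logistic_regression implement the hypothesis, loss and parameter updates, and evaluate together with accuracy_calculator score the learned parameters on held-out data, with the runner functions at the end tying the whole process together.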
For this tutorial blog, we will use an entry-level toy dataset: the Pima Indians Diabetes dataset. Each datapoint has a single target variable, making this a binary classification task of predicting whether a person has diabetes or not.
About the dataset
You can learn all about the dataset on Kaggle. The dataset has around 768 datapoints and 8 features, which is quite a sweet spot for having a decent model without worrying about underfitting. The data is about female patients, specifically their BMI, insulin level, age, skin thickness, glucose level, etc. The target tells whether the person has diabetes or not. 500 of the datapoints are non-diabetic and 268 are diabetic. The data is not highly skewed, so plain test accuracy should suffice (no need to rely on precision, recall and F1 score).
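The snippets below use loadtxt/savetxt from NumPy, gzip, tqdm (for the progress bar) and matplotlib (for the training-curve plots); a minimal set of imports that makes them runnable is:
import gzip
import numpy as np
from numpy import loadtxt, savetxt
import matplotlib.pyplot as plt
from tqdm import tqdm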
Helper Functions
For normalizing the dataset
This helps in normalizing the data, bringing every feature into the range 0-1.
def scalify_min_max(np_dataframe):
    minimum_array=np.amin(np_dataframe,axis=0)
    maximum_array=np.amax(np_dataframe,axis=0)
    range_array = maximum_array-minimum_array
    scaled = (np_dataframe-minimum_array)/range_array
    return scaled
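As a quick sanity check, applying it to a tiny made-up array (the values below are purely illustrative) scales each column independently:
toy = np.array([[1.,10.],[2.,20.],[3.,30.]])
print(scalify_min_max(toy))
#[[0.  0. ]
# [0.5 0.5]
# [1.  1. ]]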
For calculating the accuracy
def accuracy_calculator(Y_out,Y):
    accuracy=np.sum(np.logical_not(np.logical_xor(Y_out,Y)))/Y.shape[0]
    true_positives=np.sum(np.logical_and(Y_out,Y))
    false_positives=np.sum(np.logical_and(Y_out,np.logical_not(Y)))
    false_negatives=np.sum(np.logical_and(np.logical_not(Y_out),Y))
    precision=true_positives/(true_positives+false_positives)
    recall=true_positives/(true_positives+false_negatives)
    print("Precision:",precision,".Recall:",recall)
    F1_score=2*precision*recall/(precision+recall)#Harmonic mean of precision and recall
    return [accuracy,precision,recall,F1_score]
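For reference, the quantities computed above are accuracy = correct predictions / total predictions, precision = TP / (TP + FP), recall = TP / (TP + FN) and F1 = 2 * precision * recall / (precision + recall).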
For preparing the dataset – creating train/val/test splits
def pre_data_prep(filename,dest_fileloc):
    with open(filename,'rb') as f:
        gzip_fd=gzip.GzipFile(fileobj=f)
        next(gzip_fd)#Skip the header row
        diabetes_df = loadtxt(gzip_fd,delimiter=',',dtype=np.float32)
    Y=diabetes_df[:,-1]
    scaled_diabetes_df = scalify_min_max(diabetes_df[:,:-1])#Scale the features only, not the target
    concat_diabetes = np.concatenate((scaled_diabetes_df,np.array([Y]).T),axis=1)
    savetxt(dest_fileloc,concat_diabetes,delimiter=',')
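This step only needs to run once. For example (the file names here are placeholders for wherever you keep the gzipped Kaggle CSV and the scaled output):
pre_data_prep('diabetes.csv.gz','diabetes_scaled.csv')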
def dataprep(fileloc,split):
    assert len(split) == 3
    assert abs(sum(split)-1) < 1e-6#Allow for floating-point error in the ratios
    diabetes_data = loadtxt(fileloc,delimiter=',',dtype=np.float32)
    Y=np.array([diabetes_data[:,-1]]).T
    classes = np.unique(Y)
    assert len(classes) == 2
    X=diabetes_data[:,:-1]
    data_size=X.shape[0]
    print(data_size,X.shape,Y.shape)
    split_size=int(split[0]*data_size)#Train split
    val_split=int(split[1]*data_size)#Validation split
    X_train=X[:split_size]
    X_val=X[split_size:split_size+val_split]
    X_test=X[split_size+val_split:]
    Y_train=Y[:split_size]
    Y_val=Y[split_size:split_size+val_split]
    Y_test=Y[split_size+val_split:]
    return X_train,X_val,X_test,Y_train,Y_val,Y_test
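A typical call looks like this (the 70/15/15 split is just one reasonable choice, and the file name matches the hypothetical one used above):
X_train,X_val,X_test,Y_train,Y_val,Y_test = dataprep('diabetes_scaled.csv',(0.7,0.15,0.15))
Note that the splits are taken in file order; if the file were sorted by class you would want to shuffle the rows first.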
Evaluation function
For finding the accuracy of the learned model on the validation and test datasets.
def evaluate(theta_params,X,Y=None,thresh=0.5):
    data_size=X.shape[0]
    X_extend=np.concatenate((np.ones((data_size,1)),X),axis=1)#Add bias column to X
    pred = np.greater(np.matmul(X_extend,theta_params),thresh)*1#Threshold the raw linear score
    cost=np.sum(np.square(np.matmul(X_extend,theta_params)-Y))/(data_size*2)#Mean-squared-error style cost
    return pred,cost
Logistic Regression Function
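The hypothesis is the sigmoid of a linear score, h_theta(x) = 1/(1+e^(-theta^T x)), and training runs batch gradient descent on the binary cross-entropy loss, whose gradient works out to X^T(h_theta - y)/m, giving the update theta = theta - learning_rate * X^T(h_theta - y)/m.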
def sigmoid_func(theta,X):
    retval = 1/(1+np.exp(-1*np.matmul(theta.T,X)))
    return retval
def logistic_regression(X,Y,learning_rate=0.001,num_iters=100,thresh=0.5,rand_seed=None):
    if rand_seed!=None:#For reproducible results
        np.random.seed(rand_seed)
    data_size = X.shape[0]
    theta_params=np.array([np.random.randn(X.shape[1]+1)]).T#Random initial parameters
    #Add bias column to X
    X_extend = np.concatenate((np.ones((data_size,1)),X),axis=1).T
    cost=[]#Keep track of cost after each iteration of learning
    for i in tqdm(range(num_iters),desc="Training.."):
        h_theta=sigmoid_func(theta_params,X_extend).T#mX1
        grad=np.matmul(X_extend,(h_theta-Y))/data_size#nXm*mX1=nX1
        theta_params=theta_params-learning_rate*grad
        cost.append(-1*np.sum(Y*np.log(h_theta)+(1-Y)*np.log(1-h_theta))/(data_size))#Binary cross-entropy
    final_pred = np.greater(np.matmul(X_extend.T,theta_params),thresh)*1#Threshold the raw linear score
    accuracy=np.sum(np.logical_not(np.logical_xor(final_pred,Y)))/data_size
    cost=np.array(cost)
    return theta_params,accuracy,cost
Linear Regression Function
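Here linear regression is used directly as a classifier: theta^T x is fitted to the 0/1 labels with a mean-squared-error loss, the gradient X^T(theta^T x - y)/m drives the same kind of gradient-descent update, and the final prediction simply thresholds the raw output at 0.5.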
def linear_regression(X,Y,learning_rate=0.001,num_iters=100,thresh=0.5,rand_seed=None):
    if rand_seed!=None:#For reproducible results
        np.random.seed(rand_seed)
    data_size = X.shape[0]
    #print(X.shape,Y.shape)
    theta_params=np.array([np.random.randn(X.shape[1]+1)]).T#Random initial parameters
    X_extend = np.concatenate((np.ones((data_size,1)),X),axis=1)#Add bias column to X
    cost=[]
    for i in tqdm(range(num_iters),desc="Training.."):
        theta_params=theta_params-learning_rate*np.matmul((np.matmul(theta_params.T,X_extend.T)-Y.T),X_extend).T/data_size#Gradient descent step
        cost.append(np.sum(np.square(np.matmul(X_extend,theta_params)-Y))/(data_size*2))#Mean-squared-error cost
    final_pred = np.greater(np.matmul(X_extend,theta_params),thresh)*1#Threshold the raw output at 0.5
    accuracy=np.sum(np.logical_not(np.logical_xor(final_pred,Y)))/data_size
    cost=np.array(cost)
    return theta_params,accuracy,cost
Runner functions for Linear and Logistic Regressions
#######################--------Linear RUNNER---------###############################
def regression_runner(fileloc,data_split_ratios,seed_values):
    X_train,X_val,X_test,Y_train,Y_val,Y_test = dataprep(fileloc,data_split_ratios)
    all_models=[]
    all_val_accuracies=[]
    random_seeds=seed_values
    num_iters=500
    x_axis=np.arange(num_iters)
    for i in range(len(random_seeds)):
        model,train_accuracy,cost=linear_regression(X_train,Y_train,rand_seed=random_seeds[i],num_iters=num_iters)
        print("Trial:",i,".Train Accuracy:",train_accuracy)
        all_models.append(model)
        plt.plot(x_axis,cost,label=str(random_seeds[i]))#One cost curve per seed
        val_prediction,val_cost=evaluate(model,X_val,Y_val)
        accuracy_precision=accuracy_calculator(val_prediction,Y_val)
        all_val_accuracies.append(accuracy_precision[0])
        print("Validation Accuracy:",accuracy_precision)
        print("Validation Cost:",val_cost)
    #plt.legend()
    plt.title("Linear Regression")
    plt.xlabel('Number of iterations')
    plt.ylabel('Cost')
    plt.show()
    max_accuracy_idx=np.where(all_val_accuracies==np.amax(all_val_accuracies))[0][0]#Pick the model with the best validation accuracy
    best_model=all_models[max_accuracy_idx]
    print(best_model.shape)
    #print(X_test.shape,Y_test.shape)
    test_pred,test_cost=evaluate(best_model,X_test,Y_test)
    #print(test_pred.shape,test_cost)
    test_accuracy,test_precision,test_recall,test_f1=accuracy_calculator(test_pred,Y_test)
    print("Test accuracy:",test_accuracy,".Test cost:",test_cost)
#####################-------------LOGISTIC RUNNER--------------##########################
def logistic_runner(fileloc,data_split_ratios,seed_values):
    X_train,X_val,X_test,Y_train,Y_val,Y_test = dataprep(fileloc,data_split_ratios)
    all_models=[]
    all_val_accuracies=[]
    random_seeds=seed_values
    num_iters=1500
    x_axis=np.arange(num_iters)
    for i in range(len(random_seeds)):
        model,train_accuracy,cost=logistic_regression(X_train,Y_train,rand_seed=random_seeds[i],num_iters=num_iters)
        print("Trial:",i,".Train Accuracy:",train_accuracy)
        all_models.append(model)
        plt.plot(x_axis,cost,label=str(random_seeds[i]))#One cost curve per seed
        val_prediction,val_cost=evaluate(model,X_val,Y_val)
        accuracy_precision=accuracy_calculator(val_prediction,Y_val)
        all_val_accuracies.append(accuracy_precision[0])
        print("Validation Accuracy:",accuracy_precision)
        print("Validation Cost:",val_cost)
    #plt.legend()
    plt.title("Logistic Regression")
    plt.xlabel('Number of iterations')
    plt.ylabel('Cost')
    plt.show()
    max_accuracy_idx=np.where(all_val_accuracies==np.amax(all_val_accuracies))[0][0]#Pick the model with the best validation accuracy
    best_model=all_models[max_accuracy_idx]
    test_pred,test_cost=evaluate(best_model,X_test,Y_test)
    #print(test_pred.shape,test_cost)
    test_accuracy,test_precision,test_recall,test_f1=accuracy_calculator(test_pred,Y_test)
    print("Test accuracy:",test_accuracy,".Test cost:",test_cost)
Training Curves
Note that each of the two trainings below was performed with 10 different values of the initial theta. The initial value of theta affects the overall training performance. The best of the 10 models (by validation accuracy) was taken into consideration for the final evaluation on the test dataset.


Test Accuracies
Linear Regression:
Test accuracy: 0.7068965517241379 .Test cost: 0.14745936729023856
Logistic Regression:
Test accuracy: 0.646551724137931 .Test cost: 0.2865915372479961
Assimilated code
Additional Notes
- The above code does not use regularization.
- It may appear that for a few curves training was stopped prematurely, but in fact the test results were close to optimal for the above training parameters.
- Although linear regression appears to perform better in this case, it might give poorer results on other datasets.
- Note that logistic regression took 3× as many iterations as linear regression to converge.
- Initial model parameters are chosen randomly by varying the seed values. The initial model parameters (theta) affect the overall training performance, hence 10 such values were tried.