# Import the necessary libraries for EDA
import pandas as pd
import altair as alt
import numpy as np

from sklearn import tree
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, SVR
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import (FunctionTransformer, Normalizer, OneHotEncoder, StandardScaler, normalize, scale)
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import make_scorer
from scipy.stats import lognorm, loguniform, randint

# Load the raw cheese records from the CSV file
cheese = pd.read_csv(filepath_or_buffer="data/cheese_data.csv")


# Discard the columns this analysis does not use, then remove every row that
# still has a missing value in one of the remaining columns
cheese_df = cheese.drop(
    [
        'CheeseId',
        'Organic',
        'ManufacturerProvCode',
        'ManufacturingTypeEn',
        'FlavourEn',
        'CharacteristicsEn',
        'CategoryTypeEn',
        'MilkTreatmentTypeEn',
        'RindTypeEn',
        'CheeseName',
    ],
    axis="columns",
)
cheese_df = cheese_df.dropna()

# Preview the first rows of the cleaned data
cheese_df.head()


# Report the (rows, columns) shape of the cleaned data
cheese_df.shape

(1027, 3)


# Separate the label from the predictors: FatLevel is the target and every
# other remaining column is a feature
y = cheese_df["FatLevel"]
X = cheese_df.drop("FatLevel", axis="columns")

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=123,
)

# Preview the first rows of the training features
X_train.head()


# Report the (rows, columns) shape of the training features
X_train.shape

(718, 2)


# Generate summary statistics (count, mean, std, min, quartiles, max) for the
# training features; only the numeric MoisturePercent column is summarised
X_train.describe()


# Display the index range, column names, non-null counts, dtypes and memory
# usage of the training features
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 718 entries, 687 to 111
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   MoisturePercent  718 non-null    float64
 1   MilkTypeEn       718 non-null    object 
dtypes: float64(1), object(1)
memory usage: 16.8+ KB


def _fat_level_scatter(data, x_field, title):
    """Build an interactive point chart of ``FatLevel`` against one feature.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``x_field`` and a ``FatLevel`` column.
    x_field : str
        Name of the column plotted on the x axis.
    title : str
        Title shown above the chart.

    Returns
    -------
    alt.Chart
        A 500x300 chart with ``FatLevel`` on the y axis, as the point colour
        and in the tooltip.
    """
    points = alt.Chart(data, width=500, height=300).mark_point()
    encoded = points.encode(
        x=x_field,
        y='FatLevel',
        color='FatLevel',
        tooltip=['FatLevel'],
    )
    return encoded.interactive().properties(title=title)


# Main chart: fat level against moisture percentage
cheese_plot1 = _fat_level_scatter(
    cheese_df,
    'MoisturePercent',
    'The relationship between moisture and fat level',
)

# Display the chart
cheese_plot1


# Supporting chart: fat level against milk type
cheese_plot2 = _fat_level_scatter(
    cheese_df,
    'MilkTypeEn',
    'The relationship between milk type and fat level',
)

# Display the chart
cheese_plot2


# Identify the numeric and categorical feature columns
numeric_feats = ["MoisturePercent"]
categorical_feats = ["MilkTypeEn"]

# Preprocessing for numerical data: impute missing values with the median,
# then standardise the feature
numeric_transformer = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler(),
)

# Preprocessing for categorical data: impute missing values with the most
# frequent category, then one-hot encode. `fill_value` is only honoured by
# strategy="constant", so it is deliberately not passed here.
# handle_unknown="ignore" encodes categories unseen during fit as all zeros
# instead of raising at prediction time.
categorical_transformer = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown="ignore"),
)

# Combine both preprocessing pipelines; any column not listed above is passed
# through unchanged
preprocessor = make_column_transformer(
    (numeric_transformer, numeric_feats),
    (categorical_transformer, categorical_feats),
    remainder="passthrough",
)


# Create the baseline model: it ignores the features and predicts from the
# class distribution of the labels it was fitted on
dummy_clf = DummyClassifier(strategy="prior")

# Run 5-fold cross-validation for the baseline, keeping the training scores too
dummy_scores = pd.DataFrame(
    cross_validate(
        dummy_clf,
        X_train,
        y_train,
        cv=5,
        return_train_score=True,
    )
)
dummy_scores


# Average the per-fold training and validation scores of the baseline
mean_dummy_training_score, mean_dummy_cv_score = (
    dummy_scores[column].mean() for column in ('train_score', 'test_score')
)

# Summarise both averages in a small two-row table
mean_dummy_scores = pd.DataFrame(
    {
        'Mean': ['training_score', 'cv_score'],
        'Scores': [mean_dummy_training_score, mean_dummy_cv_score],
    }
)
mean_dummy_scores


# Chain the preprocessor with a random forest; class_weight='balanced'
# re-weights the classes to compensate for their unequal frequencies
forest_clf = RandomForestClassifier(random_state=77, class_weight='balanced')
main_pipe = make_pipeline(preprocessor, forest_clf)

# Run 5-fold cross-validation for the pipeline, keeping the training scores too
scores_df = pd.DataFrame(
    cross_validate(
        main_pipe,
        X_train,
        y_train,
        cv=5,
        return_train_score=True,
    )
)
scores_df


# Average the per-fold training and validation scores of the random forest
mean_rfc_training_score, mean_rfc_cv_score = (
    scores_df[column].mean() for column in ('train_score', 'test_score')
)

# Summarise both averages in a small two-row table
mean_rfc_scores = pd.DataFrame(
    {
        'Mean': ['training_score', 'cv_score'],
        'Scores': [mean_rfc_training_score, mean_rfc_cv_score],
    }
)
mean_rfc_scores


# Candidate values for the forest's max_depth: 1, 11, 21, ..., 91
param_grid = {"randomforestclassifier__max_depth": range(1, 101, 10)}

# Randomly sample 5 of the candidate depths and score each one with 5-fold
# cross-validation; the seed makes the sampled candidates reproducible
depth_search = RandomizedSearchCV(
    estimator=main_pipe,
    param_distributions=param_grid,
    n_iter=5,
    cv=5,
    random_state=77,
    return_train_score=True,
    verbose=2,
)
depth_search.fit(X_train, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] END ...............randomforestclassifier__max_depth=21; total time=   0.3s
[CV] END ...............randomforestclassifier__max_depth=21; total time=   0.3s
[CV] END ...............randomforestclassifier__max_depth=21; total time=   0.3s
[CV] END ...............randomforestclassifier__max_depth=21; total time=   0.3s
[CV] END ...............randomforestclassifier__max_depth=21; total time=   0.3s
[CV] END ...............randomforestclassifier__max_depth=11; total time=   0.3s
[CV] END ...............randomforestclassifier__max_depth=11; total time=   0.3s
[CV] END ...............randomforestclassifier__max_depth=11; total time=   0.3s
[CV] END ...............randomforestclassifier__max_depth=11; total time=   0.3s
[CV] END ...............randomforestclassifier__max_depth=11; total time=   0.3s
[CV] END ...............randomforestclassifier__max_depth=91; total time=   0.3s
[CV] END ...............randomforestclassifier__max_depth=91; total time=   0.3s
[CV] END ...............randomforestclassifier__max_depth=91; total time=   0.3s
[CV] END ...............randomforestclassifier__max_depth=91; total time=   0.3s
[CV] END ...............randomforestclassifier__max_depth=91; total time=   0.3s
[CV] END ...............randomforestclassifier__max_depth=61; total time=   0.3s
[CV] END ...............randomforestclassifier__max_depth=61; total time=   0.3s
[CV] END ...............randomforestclassifier__max_depth=61; total time=   0.3s
[CV] END ...............randomforestclassifier__max_depth=61; total time=   0.3s
[CV] END ...............randomforestclassifier__max_depth=61; total time=   0.3s
[CV] END ................randomforestclassifier__max_depth=1; total time=   0.2s
[CV] END ................randomforestclassifier__max_depth=1; total time=   0.2s
[CV] END ................randomforestclassifier__max_depth=1; total time=   0.2s
[CV] END ................randomforestclassifier__max_depth=1; total time=   0.3s
[CV] END ................randomforestclassifier__max_depth=1; total time=   0.2s

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('columntransformer',
                                              ColumnTransformer(remainder='passthrough',
                                                                transformers=[('pipeline-1',
                                                                               Pipeline(steps=[('simpleimputer',
                                                                                                SimpleImputer(strategy='median')),
                                                                                               ('standardscaler',
                                                                                                StandardScaler())]),
                                                                               ['MoisturePercent']),
                                                                              ('pipeline-2',
                                                                               Pipeline(steps=[('simpleimputer',
                                                                                                SimpleImputer(fill_value='missing',
                                                                                                              strategy='most_frequent')),
                                                                                               ('onehotencoder',
                                                                                                OneHotEncoder(handle_unknown='ignore'))]),
                                                                               ['MilkTypeEn'])])),
                                             ('randomforestclassifier',
                                              RandomForestClassifier(class_weight='balanced',
                                                                     random_state=77))]),
                   n_iter=5,
                   param_distributions={'randomforestclassifier__max_depth': range(1, 101, 10)},
                   random_state=77, return_train_score=True, verbose=2)


# Tabulate the randomized-search results, keeping only the columns of interest
# and listing the best-ranked candidates first
grid_results = (
    pd.DataFrame(
        depth_search.cv_results_,
        columns=[
            'mean_test_score',
            'param_randomforestclassifier__max_depth',
            'mean_fit_time',
            'rank_test_score',
        ],
    )
    .sort_values('rank_test_score')
)

grid_results


# Pull the winning hyperparameters and their mean cross-validation score out
# of the fitted search, then report both
best_parameters, best_score = depth_search.best_params_, depth_search.best_score_
for label, value in (("Best parameters:", best_parameters), ("Best score:", best_score)):
    print(label, value)

Best parameters: {'randomforestclassifier__max_depth': 21}
Best score: 0.8133255633255633


# Retrieve the best model: the pipeline refitted by the search using the
# best-scoring max_depth (refit is left at its default)
best_model = depth_search.best_estimator_
best_model

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('pipeline-1',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('standardscaler',
                                                                   StandardScaler())]),
                                                  ['MoisturePercent']),
                                                 ('pipeline-2',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='most_frequent')),
                                                                  ('onehotencoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['MilkTypeEn'])])),
                ('randomforestclassifier',
                 RandomForestClassifier(class_weight='balanced', max_depth=21,
                                        random_state=77))])


# Build and print the per-class precision / recall / F1 report for the best
# model's predictions on the held-out test set
test_predictions = best_model.predict(X_test)
report = classification_report(y_true=y_test, y_pred=test_predictions)
print(report)

              precision    recall  f1-score   support

  higher fat       0.74      0.65      0.69       105
   lower fat       0.83      0.88      0.86       204

    accuracy                           0.80       309
   macro avg       0.78      0.76      0.77       309
weighted avg       0.80      0.80      0.80       309


# Evaluate the accuracy of the best model on the held-out test set
# (the pipeline's score() reports the final classifier's mean accuracy)
test_score = best_model.score(X_test, y_test)
test_score

0.8025889967637541


# Predict class labels for the training and test sets. best_model is the
# refitted estimator that depth_search.predict delegates to, so it is used
# directly here to stay consistent with the rest of the evaluation.
tr_pred = best_model.predict(X_train)
ts_pred = best_model.predict(X_test)


# Predict class-membership probabilities for the training and test sets
tr_prob = best_model.predict_proba(X_train)
ts_prob = best_model.predict_proba(X_test)


# Plot the confusion matrix for the best model on the test set.
# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2;
# ConfusionMatrixDisplay.from_estimator (scikit-learn >= 1.0) replaces it.
# NOTE: the legacy `plot_confusion_matrix` import at the top of the file is no
# longer needed by this cell.
from sklearn.metrics import ConfusionMatrixDisplay

ConfusionMatrixDisplay.from_estimator(best_model, X_test, y_test, values_format="d", cmap="Blues")

<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7f96427fc370>

	MoisturePercent	MilkTypeEn	FatLevel
0	47.0	Ewe	lower fat
1	47.9	Cow	lower fat
2	54.0	Cow	lower fat
3	47.0	Cow	lower fat
4	49.4	Cow	lower fat

	MoisturePercent	MilkTypeEn
687	54.0	Cow
885	55.0	Goat
861	46.0	Cow
967	39.0	Cow
940	42.0	Goat

	MoisturePercent
count	718.000000
mean	47.083705
std	9.785916
min	20.000000
25%	40.000000
50%	46.000000
75%	52.000000
max	92.000000

	fit_time	score_time	test_score	train_score
0	0.004391	0.004534	0.652778	0.651568
1	0.002260	0.002185	0.652778	0.651568
2	0.002279	0.001087	0.652778	0.651568
3	0.003698	0.005693	0.650350	0.652174
4	0.002368	0.001239	0.650350	0.652174

	fit_time	score_time	test_score	train_score
0	0.333828	0.036371	0.833333	0.862369
1	0.252244	0.022920	0.805556	0.865854
2	0.247132	0.022925	0.833333	0.871080
3	0.256660	0.023254	0.811189	0.866087
4	0.246029	0.034455	0.783217	0.871304

Utilizing Confusion Matrix and Accuracy for Testing Classification Models

Machine Learning-Assisted Exploratory Data Analysis

1. Introduction

Data Source and Overview

Data schema

2. Preprocessing Data through Cleaning

3. Analysis of Data through Descriptive Statistics

4. Representation of Data through Visualization

5. Assessment of Classification Models in Machine Learning

Is there a correlation between fat content and moisture level, and does this correlation vary depending on the type of milk being analyzed?

5-1. Dummy Classifier

5-2. Random Forest Classifier

5-3. Evaluating Classification Models

5-4. Confusion Matrix

6. Accuracy

7. Remarks

8. References

	mean_test_score	param_randomforestclassifier__max_depth	mean_fit_time	rank_test_score
0	0.813326	21	0.280368	1
1	0.813326	11	0.275888	1
2	0.813326	91	0.263958	1
3	0.813326	61	0.262938	1
4	0.806352	1	0.200702	5

	Mean	Scores
0	training_score	0.651810
1	cv_score	0.651807

	Mean	Scores
0	training_score	0.867339
1	cv_score	0.813326