{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data Analysis Script\n",
    "\n",
    "This notebook performs comprehensive data analysis including preprocessing, statistical analysis, and machine learning techniques.\n",
    "\n",
    "**Created on:** Sat Oct 28 16:00:55 2023  \n",
    "**Author:** mm"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Import Libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.preprocessing import StandardScaler, MinMaxScaler\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from statsmodels.formula.api import ols\n",
    "from statsmodels.stats.multicomp import MultiComparison\n",
    "import statsmodels.api as sm\n",
    "from scipy.stats import pearsonr\n",
    "from sklearn.decomposition import PCA\n",
    "import scipy.cluster.hierarchy as sch\n",
    "from sklearn.cross_decomposition import PLSRegression\n",
    "from sklearn.model_selection import cross_val_predict\n",
    "from sklearn.metrics import accuracy_score, confusion_matrix, classification_report\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from statsmodels.stats.multicomp import pairwise_tukeyhsd\n",
    "from sklearn.experimental import enable_iterative_imputer  # noqa\n",
    "from sklearn.impute import IterativeImputer\n",
    "from mpl_toolkits.mplot3d import Axes3D\n",
    "from itertools import combinations"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Import Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import data\n",
    "df = pd.read_csv(\"data_rice.csv\", sep=\",\")\n",
    "print(df.columns.values.tolist())\n",
    "print(df.dtypes)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Data Preprocessing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a copy of the dataframe\n",
    "XY = df.copy()\n",
    "\n",
    "# Step 1: Check column types and separate qualitative and quantitative data\n",
    "qualitative_cols = [col for col in XY.columns if XY[col].dtype == 'object']\n",
    "quantitative_cols = [col for col in XY.columns if col not in qualitative_cols]\n",
    "\n",
    "# Step 2: Separate columns containing a mix of text and numbers\n",
    "mixed_cols = [col for col in qualitative_cols if XY[col].apply(lambda x: isinstance(x, str) and any(c.isdigit() for c in x) and any(c.isalpha() for c in x)).any()]\n",
    "mixed_df = XY[mixed_cols].copy()\n",
    "\n",
    "# Step 3: Identify text columns without missing or mixing\n",
    "text_no_mix_missing = [col for col in qualitative_cols if col not in mixed_cols and not XY[col].isnull().any()]\n",
    "\n",
    "# Step 4: Identify text columns with missing data\n",
    "text_missing = [col for col in qualitative_cols if XY[col].isnull().any()]\n",
    "\n",
    "# Step 5: Identify qualitative columns without missing data\n",
    "qualitative_no_missing = [col for col in qualitative_cols if col not in mixed_cols and col not in text_missing]\n",
    "\n",
    "# Step 6: Identify qualitative columns with missing data\n",
    "qualitative_missing = [col for col in qualitative_cols if col not in mixed_cols and col not in text_no_mix_missing]\n",
    "\n",
    "# Step 7: Identify quantitative columns without missing data\n",
    "quantitative_no_missing = [col for col in quantitative_cols if not XY[col].isnull().any()]\n",
    "\n",
    "# Step 8: Identify quantitative columns with missing data\n",
    "quantitative_missing = [col for col in quantitative_cols if XY[col].isnull().any()]\n",
    "\n",
    "# Step 9: Separate mixed columns into those with missing and those without\n",
    "mixed_no_missing = [col for col in mixed_cols if not XY[col].isnull().any()]\n",
    "mixed_missing = [col for col in mixed_cols if XY[col].isnull().any()]\n",
    "\n",
    "# Step 10: Order columns as specified\n",
    "ordered_cols = text_no_mix_missing + qualitative_no_missing + quantitative_no_missing + mixed_no_missing + text_missing + qualitative_missing + quantitative_missing + mixed_missing\n",
    "\n",
    "# Now, XY contains all columns ordered as specified\n",
    "XY = XY[ordered_cols]\n",
    "\n",
    "# Step 11: Drop columns with duplicate names, keeping only the first entry\n",
    "XY = XY.loc[:, ~XY.columns.duplicated()]\n",
    "\n",
    "# Keep only selected columns\n",
    "keep_cols = mixed_no_missing + text_no_mix_missing + qualitative_no_missing + quantitative_no_missing \n",
    "XY = XY[keep_cols]\n",
    "\n",
    "print(\"\\nCleaned DataFrame XY:\")\n",
    "print(XY)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Handle Numeric Categorical Columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# This function identifies numeric columns in the DataFrame that can be considered categorical\n",
    "def find_numeric_categorical_columns(df, max_categories=5):\n",
    "    numeric_categorical_cols = []\n",
    "    \n",
    "    for column in df.select_dtypes(include=['int64', 'float64']).columns:\n",
    "        unique_values = df[column].nunique()\n",
    "        \n",
    "        # Check if the column has a limited number of unique values\n",
    "        if unique_values <= max_categories:\n",
    "            numeric_categorical_cols.append(column)\n",
    "    \n",
    "    return numeric_categorical_cols\n",
    "\n",
    "# This function converts specified columns in the DataFrame to categorical data type\n",
    "def convert_to_categorical(df, columns):\n",
    "    for column in columns:\n",
    "        df[column] = df[column].astype('category')\n",
    "    return df\n",
    "\n",
    "# Example usage:\n",
    "df_all = df.copy()\n",
    "numeric_categorical_columns = find_numeric_categorical_columns(df, max_categories=5)\n",
    "df = convert_to_categorical(df, numeric_categorical_columns)\n",
    "\n",
    "print(df.dtypes)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Remove Sequential Numeric Columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Function to remove columns that contain sequential numerical counting\n",
    "def remove_sequential_numeric_columns(df):\n",
    "    columns_to_drop = []\n",
    "    \n",
    "    for column in df.columns:\n",
    "        # Check if the column is a numeric column\n",
    "        if pd.api.types.is_numeric_dtype(df[column]):\n",
    "            # Sort values and check if they are sequential starting from 1\n",
    "            sorted_values = df[column].dropna().sort_values().unique()\n",
    "            if (sorted_values == range(1, len(sorted_values) + 1)).all():\n",
    "                columns_to_drop.append(column)\n",
    "    \n",
    "    # Drop identified columns\n",
    "    df = df.drop(columns=columns_to_drop)\n",
    "    return df\n",
    "\n",
    "# Example usage:\n",
    "df = remove_sequential_numeric_columns(df)\n",
    "\n",
    "# Function to convert object columns in a DataFrame to categorical type\n",
    "def convert_object_to_categorical(df):\n",
    "    for column in df.select_dtypes(include='object').columns:\n",
    "        df[column] = df[column].astype('category')\n",
    "    return df\n",
    "\n",
    "# Convert object columns to categorical\n",
    "df = convert_object_to_categorical(df)\n",
    "print(df.dtypes)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. Separate and Normalize Quantitative Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Selecting only non-numeric (qualitative) columns\n",
    "qualitative_data = df.select_dtypes(exclude=['number'])\n",
    "# Selecting only numeric (quantitative) columns\n",
    "quantitative_data = df.select_dtypes(include=['number'])\n",
    "\n",
    "# 1. Normalize the quantitative data using z-scores\n",
    "scaler_z = StandardScaler()\n",
    "quantitative_data_z = pd.DataFrame(scaler_z.fit_transform(quantitative_data), columns=quantitative_data.columns)\n",
    "\n",
    "# 2. Normalize the quantitative data using Min-Max scaling\n",
    "scaler_mm = MinMaxScaler()\n",
    "quantitative_data_mm = pd.DataFrame(scaler_mm.fit_transform(quantitative_data), columns=quantitative_data.columns)\n",
    "\n",
    "# Displaying the first few rows of the normalized data\n",
    "quantitative_data_z.head(), quantitative_data_mm.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7. Quantitative Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "quantitative_vars = df.select_dtypes(include=['int', 'float']).columns.tolist()\n",
    "\n",
    "# Applying Min-Max Normalization to the quantitative variables\n",
    "scaler = MinMaxScaler()\n",
    "df_min_max = df.copy()\n",
    "df_min_max[quantitative_vars] = scaler.fit_transform(df[quantitative_vars])\n",
    "\n",
    "# Function to create boxplots for normalized and non-normalized data\n",
    "def plot_boxplots(df_original, df_normalized, variables):\n",
    "    fig, axs = plt.subplots(nrows=2, figsize=(15, 10))\n",
    "\n",
    "    sns.boxplot(data=df_original[variables], ax=axs[0])\n",
    "    axs[0].set_title('Boxplot (Original Data)', fontsize=16)\n",
    "    axs[0].tick_params(axis='x', rotation=45)\n",
    "\n",
    "    sns.boxplot(data=df_normalized[variables], ax=axs[1])\n",
    "    axs[1].set_title('Boxplot (Normalized Data)', fontsize=16)\n",
    "    axs[1].tick_params(axis='x', rotation=45)\n",
    "\n",
    "    plt.tight_layout()\n",
    "    plt.show()\n",
    "\n",
    "# Creating boxplots for all variables together\n",
    "plot_boxplots(df, df_min_max, quantitative_vars)\n",
    "\n",
    "df_descr = df.describe()\n",
    "df_descr.to_csv('03df_descr.csv', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 8. Calculate Mean and RSD by Category"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Automatically detect quantitative and categorical columns\n",
    "categorical_columns = df.select_dtypes(include=['object', 'category']).columns\n",
    "quantitative_columns = df.select_dtypes(include=[np.number]).columns\n",
    "\n",
    "# Dictionary to store combined results for each categorical column\n",
    "combined_results = {}\n",
    "\n",
    "# Loop through each categorical column to calculate grouped means and RSDs\n",
    "for cat_col in categorical_columns:\n",
    "    # Select only the quantitative columns along with the current categorical column for grouping\n",
    "    DT = df[[cat_col] + list(quantitative_columns)]\n",
    "    \n",
    "    # Calculate mean and RSD for each group based on the current categorical column\n",
    "    DT_mean = DT.groupby(cat_col).mean().round(3)\n",
    "    DT_rsd = (DT.groupby(cat_col).std() / DT.groupby(cat_col).mean() * 100).round(1)\n",
    "    \n",
    "    # Combine the means and RSDs by interleaving columns\n",
    "    combined_df = pd.DataFrame()\n",
    "    for col in DT_mean.columns:\n",
    "        combined_df[col + '_mean'] = DT_mean[col]\n",
    "        combined_df[col + '_sd'] = DT_rsd[col]\n",
    "    \n",
    "    # Save the individual results to CSV (optional)\n",
    "    DT_mean.to_csv(f'{cat_col}_DT_mean.csv', index=True)\n",
    "    DT_rsd.to_csv(f'{cat_col}_DT_rsd.csv', index=True)\n",
    "    combined_df.to_csv(f'{cat_col}_combined_mean_sd.csv', index=True)\n",
    "    \n",
    "    # Store the combined result in the dictionary with the categorical column as key\n",
    "    combined_results[cat_col] = combined_df\n",
    "\n",
    "# If you want to combine all results into one large DataFrame with hierarchical columns\n",
    "final_combined_df = pd.concat(combined_results, axis=1)\n",
    "final_combined_df.to_csv('all_categoricals_combined_mean_sd.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 9. Outlier Detection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define your detect_outliers function\n",
    "def detect_outliers(series):\n",
    "    # Example: Using the IQR method\n",
    "    Q1 = series.quantile(0.25)\n",
    "    Q3 = series.quantile(0.75)\n",
    "    IQR = Q3 - Q1\n",
    "    return (series < (Q1 - 1.5 * IQR)) | (series > (Q3 + 1.5 * IQR))\n",
    "\n",
    "# Automatically detect quantitative and categorical columns\n",
    "categorical_columns = df.select_dtypes(include=['object', 'category']).columns\n",
    "quantitative_columns = df.select_dtypes(include=[np.number]).columns\n",
    "\n",
    "# Dictionary to store outlier DataFrames for each categorical column\n",
    "outliers_results = {}\n",
    "\n",
    "# Loop through each categorical column\n",
    "for cat_col in categorical_columns:\n",
    "    # Select quantitative columns along with the current categorical column for grouping\n",
    "    outliers_df = df[[cat_col] + list(quantitative_columns)].copy()\n",
    "\n",
    "    # Detect outliers for each numeric column grouped by the categorical column\n",
    "    for col in quantitative_columns:\n",
    "        outliers_df[col] = df.groupby(cat_col)[col].transform(detect_outliers)\n",
    "\n",
    "    # Filter rows with at least one outlier in any of the quantitative columns\n",
    "    outliers_sel = outliers_df[quantitative_columns].any(axis=1)\n",
    "    outliers_sel_df = df[outliers_sel]\n",
    "\n",
    "    # Replace outliers with missing data (NA) in a copy of the DataFrame\n",
    "    dfNA = df[[cat_col] + list(quantitative_columns)].copy()\n",
    "    for col in quantitative_columns:\n",
    "        # Apply outlier detection within each group and align the index properly\n",
    "        outlier_mask = df.groupby(cat_col)[col].transform(lambda x: detect_outliers(x))\n",
    "        dfNA[col] = df[col].where(~outlier_mask)\n",
    "\n",
    "    # Count missing values per column and in total for each categorical grouping\n",
    "    na_count = dfNA.isna().sum()\n",
    "    total_na_count = dfNA.isna().sum().sum()\n",
    "\n",
    "    # Save results to CSV files\n",
    "    outliers_df.to_csv(f'outliers_df_{cat_col}.csv', index=False)\n",
    "    dfNA.to_csv(f'dfNA_{cat_col}.csv', index=False)\n",
    "    \n",
    "    # Store results in a dictionary for easy access or further analysis if needed\n",
    "    outliers_results[cat_col] = {\n",
    "        'outliers_df': outliers_df,\n",
    "        'outliers_sel_df': outliers_sel_df,\n",
    "        'dfNA': dfNA,\n",
    "        'na_count': na_count,\n",
    "        'total_na_count': total_na_count\n",
    "    }\n",
    "\n",
    "# Print missing value counts for each categorical grouping\n",
    "for cat_col, results in outliers_results.items():\n",
    "    print(f\"Missing value counts for {cat_col}:\")\n",
    "    print(results['na_count'])\n",
    "    print(f\"Total missing values: {results['total_na_count']}\\n\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 10. Missing Value Imputation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert each categorical column to numerical codes for imputation\n",
    "dfNA = df.copy()\n",
    "for cat_col in categorical_columns:\n",
    "    dfNA[cat_col] = pd.Categorical(dfNA[cat_col]).codes\n",
    "\n",
    "# Initialize MICE imputer\n",
    "mice_imputer = IterativeImputer(random_state=123, max_iter=50, imputation_order='ascending')\n",
    "\n",
    "# Perform MICE imputation\n",
    "dfNA_imputed = mice_imputer.fit_transform(dfNA)\n",
    "\n",
    "# Convert imputed data back to DataFrame\n",
    "dfNA_imputed = pd.DataFrame(dfNA_imputed, columns=dfNA.columns)\n",
    "\n",
    "# Convert numerical codes back to original categorical values\n",
    "for cat_col in categorical_columns:\n",
    "    original_categories = df[cat_col].astype('category').cat.categories\n",
    "    dfNA_imputed[cat_col] = pd.Categorical.from_codes(dfNA_imputed[cat_col].astype(int), categories=original_categories)\n",
    "\n",
    "# Select columns needed for further processing\n",
    "Xlm = dfNA_imputed[quantitative_columns]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 11. Qualitative Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Detect categorical columns\n",
    "categorical_columns = df.select_dtypes(include=['object', 'category']).columns\n",
    "\n",
    "# Check if there are categorical columns\n",
    "if len(categorical_columns) == 0:\n",
    "    print(\"No categorical columns found in the DataFrame.\")\n",
    "elif len(categorical_columns) == 1:\n",
    "    # Handle case with only one categorical column\n",
    "    col = categorical_columns[0]\n",
    "    print(f\"Only one categorical column found: {col}\")\n",
    "    \n",
    "    # Plot frequency distribution\n",
    "    plt.figure(figsize=(8, 5))\n",
    "    sns.countplot(x=col, data=df, palette='viridis')\n",
    "    plt.title(f'Frequency Distribution of {col}')\n",
    "    plt.tight_layout()\n",
    "    plt.show()\n",
    "else:\n",
    "    # Handle case with multiple categorical columns\n",
    "    print(\"Multiple categorical columns found. Generating contingency tables and plots.\")\n",
    "    \n",
    "    # Create pairwise contingency tables\n",
    "    for col1, col2 in combinations(categorical_columns, 2):\n",
    "        # Generate contingency table for each pair of categorical columns\n",
    "        contingency_table = pd.crosstab(df[col1], df[col2])\n",
    "        \n",
    "        # Print and save contingency table\n",
    "        print(f\"Contingency Table {col1} vs {col2}:\")\n",
    "        print(contingency_table)\n",
    "        contingency_table.to_csv(f'contingency_table_{col1}_vs_{col2}.csv', index=True)\n",
    "\n",
    "    # Plot frequency distributions for each categorical column\n",
    "    fig, axes = plt.subplots(1, len(categorical_columns), figsize=(6 * len(categorical_columns), 5))\n",
    "    \n",
    "    for i, col in enumerate(categorical_columns):\n",
    "        sns.countplot(ax=axes[i], x=col, data=df, palette='viridis')\n",
    "        axes[i].set_title(f'Frequency Distribution of {col}')\n",
    "    \n",
    "    plt.tight_layout()\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 12. ANOVA and Tukey's HSD Test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Function to perform ANOVA and Tukey's HSD test\n",
    "def anova_tukey(data, quantitative_vars, qualitative_vars):\n",
    "    results = []\n",
    "    \n",
    "    for q_var in quantitative_vars:\n",
    "        for ql_var in qualitative_vars:\n",
    "            # Building the ANOVA model\n",
    "            model = ols(f'{q_var} ~ C({ql_var})', data=data).fit()\n",
    "            \n",
    "            # Performing ANOVA\n",
    "            anova_table = sm.stats.anova_lm(model, typ=2)\n",
    "            \n",
    "            # Performing Tukey's HSD test\n",
    "            mc = MultiComparison(data[q_var], data[ql_var])\n",
    "            tukey_result = mc.tukeyhsd()\n",
    "            \n",
    "            # Appending results to the list\n",
    "            results.append({\n",
    "                'Quantitative Variable': q_var,\n",
    "                'Qualitative Variable': ql_var,\n",
    "                'ANOVA': anova_table,\n",
    "                'Tukey': tukey_result\n",
    "            })\n",
    "            \n",
    "    return results\n",
    "\n",
    "# Performing ANOVA and Tukey's HSD test for quantitative variables vs qualitative variables\n",
    "quantitative_vars = df.select_dtypes(include=['int', 'float']).columns.tolist()\n",
    "qualitative_vars = df.select_dtypes(include=['object', 'category']).columns.tolist()\n",
    "\n",
    "anova_results = anova_tukey(df, quantitative_vars, qualitative_vars)\n",
    "\n",
    "# Save results to CSV\n",
    "results_df = pd.DataFrame(anova_results)\n",
    "results_df.to_csv('05anova_results.csv', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 13. Bivariate Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Correlation matrix\n",
    "correlation_matrix = df[quantitative_vars].corr()\n",
    "\n",
    "# Plotting the correlation matrix\n",
    "plt.figure(figsize=(10, 8))\n",
    "sns.heatmap(correlation_matrix, annot=True, fmt=\".2f\", cmap='coolwarm', vmin=-1, vmax=1)\n",
    "plt.title('Correlation Matrix', fontsize=16)\n",
    "plt.show()\n",
    "\n",
    "# Covariance matrix\n",
    "covariance_matrix = df[quantitative_vars].cov()\n",
    "\n",
    "# Plotting the covariance matrix\n",
    "plt.figure(figsize=(10, 8))\n",
    "sns.heatmap(covariance_matrix, annot=True, fmt=\".2f\", cmap='coolwarm')\n",
    "plt.title('Covariance Matrix', fontsize=16)\n",
    "plt.show()\n",
    "\n",
    "# Function to calculate the matrix of p-values for correlations\n",
    "def calculate_pvalues(df):\n",
    "    df = df.dropna()._get_numeric_data()\n",
    "    dfcols = pd.DataFrame(columns=df.columns)\n",
    "    pvalues = dfcols.transpose().join(dfcols, how='outer')\n",
    "    for r in df.columns:\n",
    "        for c in df.columns:\n",
    "            pvalues[r][c] = round(pearsonr(df[r], df[c])[1], 4)\n",
    "    return pvalues\n",
    "\n",
    "# Calculating the matrix of p-values\n",
    "pvalues_matrix = calculate_pvalues(df[quantitative_vars])\n",
    "\n",
    "# Plotting the matrix of p-values\n",
    "plt.figure(figsize=(10, 8))\n",
    "sns.heatmap(pvalues_matrix.astype(float), annot=True, fmt=\".4f\", cmap='coolwarm', vmin=0, vmax=1)\n",
    "plt.title('Matrix of P-values for Correlations', fontsize=16)\n",
    "plt.show()\n",
    "\n",
    "# Save results\n",
    "pvalues_matrix.to_csv('06pvalues_matrix.csv', index=False)\n",
    "correlation_matrix.to_csv('07correlation_matrix.csv', index=False)\n",
    "covariance_matrix.to_csv('08covariance_matrix.csv', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 14. Principal Component Analysis (PCA)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Redefining pca_normalized\n",
    "pca_normalized = PCA(n_components=len(quantitative_vars))\n",
    "\n",
    "# Redoing PCA using the min-max normalized data\n",
    "principal_components_normalized = pca_normalized.fit_transform(df_min_max[quantitative_vars])\n",
    "df_pca_normalized = pd.DataFrame(data=principal_components_normalized, columns=[f'PC{i+1}' for i in range(pca_normalized.n_components_)])\n",
    "\n",
    "# Compute loading scores\n",
    "loading_scores = pca_normalized.components_.T * np.sqrt(pca_normalized.explained_variance_)\n",
    "\n",
    "# Plotting\n",
    "plt.figure(figsize=(8, 6))\n",
    "\n",
    "# Score plot\n",
    "sns.scatterplot(x='PC1', y='PC2', data=df_pca_normalized, palette='viridis', label='Scores')\n",
    "plt.title('PCA (Normalized Data): Score Plot (PC1 vs PC2)', fontsize=16)\n",
    "plt.xlabel('Principal Component 1', fontsize=12)\n",
    "plt.ylabel('Principal Component 2', fontsize=12)\n",
    "plt.grid(True)\n",
    "\n",
    "# Loading plot\n",
    "for i, (x, y) in enumerate(zip(pca_normalized.components_[0], pca_normalized.components_[1])):\n",
    "    plt.arrow(0, 0, x, y, color='r', alpha=0.5, width=0.005)\n",
    "    plt.text(x, y, quantitative_vars[i], color='g', fontsize=12)\n",
    "\n",
    "plt.show()\n",
    "\n",
    "# 3D Graph\n",
    "fig = plt.figure(figsize=(10, 8))\n",
    "ax = fig.add_subplot(111, projection='3d')\n",
    "\n",
    "# Score plot in 3D\n",
    "ax.scatter(df_pca_normalized['PC1'], df_pca_normalized['PC2'], df_pca_normalized['PC3'], c='skyblue', s=60, label='Scores')\n",
    "ax.view_init(30, 185)\n",
    "ax.set_xlabel('Principal Component 1', fontsize=12)\n",
    "ax.set_ylabel('Principal Component 2', fontsize=12)\n",
    "ax.set_zlabel('Principal Component 3', fontsize=12)\n",
    "plt.title('PCA (Normalized Data): 3D Score Plot (PC1 vs PC2 vs PC3)', fontsize=16)\n",
    "\n",
    "# Loading plot in 3D\n",
    "for i, (x, y, z) in enumerate(zip(loading_scores[:,0], loading_scores[:,1], loading_scores[:,2])):\n",
    "    ax.text(x, y, z, quantitative_vars[i], color='g', fontsize=12)\n",
    "\n",
    "plt.show()\n",
    "\n",
    "# Variability Percentage\n",
    "variance_percentage = pca_normalized.explained_variance_ratio_\n",
    "for i, var in enumerate(variance_percentage):\n",
    "    print(f\"Variability of PC{i+1}: {var * 100:.2f}%\")\n",
    "\n",
    "# Calculate number of components to capture a certain percentage of variance\n",
    "target_variance = 1.0  # Capture 100% of variance\n",
    "cumulative_variance = np.cumsum(variance_percentage)\n",
    "n_components_needed = np.argmax(cumulative_variance >= target_variance) + 1\n",
    "print(f\"Number of components capturing {target_variance * 100}% of variance: {n_components_needed}\")\n",
    "\n",
    "# Save PCA results to CSV\n",
    "pca_results_df = pd.concat([df_pca_normalized, pd.DataFrame(data=loading_scores, columns=[f'Loading_PC{i+1}' for i in range(pca_normalized.n_components_)]),\n",
    "                            pd.DataFrame(data=variance_percentage, columns=['Variability_Percentage'])], axis=1)\n",
    "pca_results_df.to_csv('09pca_results.csv', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 15. Hierarchical Clustering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hierarchical clustering using the min-max normalized data\n",
    "linkage_matrix_normalized = sch.linkage(df_min_max[quantitative_vars], method='ward')\n",
    "\n",
    "# Plotting the hierarchical clustering as a dendrogram (Normalized Data)\n",
    "plt.figure(figsize=(10, 7))\n",
    "dendrogram = sch.dendrogram(linkage_matrix_normalized)\n",
    "plt.title('Hierarchical Clustering (Normalized Data): Dendrogram', fontsize=16)\n",
    "plt.xlabel('Samples', fontsize=12)\n",
    "plt.ylabel('Euclidean distances', fontsize=12)\n",
    "plt.show()\n",
    "\n",
    "# Creating a cluster map (heatmap with dendrograms) for the min-max normalized data\n",
    "sns.clustermap(df_min_max[quantitative_vars], cmap='coolwarm', figsize=(12, 12), method='ward', annot=True, fmt=\".2f\")\n",
    "plt.title('Cluster Map with Dendrograms (Normalized Data)', fontsize=16, pad=100)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 16. Data Normalization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Detect quantitative and categorical columns\n",
    "quantitative_columns = df.select_dtypes(include=['int', 'float']).columns\n",
    "categorical_columns = df.select_dtypes(include=['object', 'category']).columns\n",
    "\n",
    "# Select quantitative data for normalization\n",
    "Xmr = df[quantitative_columns]\n",
    "print(\"Selected quantitative columns:\", Xmr.columns.values.tolist())\n",
    "\n",
    "# Min-max normalization\n",
    "Xmr_min = Xmr.min()\n",
    "Xmr_max = Xmr.max()\n",
    "Xmnr = (Xmr - Xmr_min) / (Xmr_max - Xmr_min)\n",
    "\n",
    "# Select categorical data\n",
    "Xc = df[categorical_columns]\n",
    "print(\"Selected categorical columns:\", Xc.columns.values.tolist())\n",
    "\n",
    "# Combine normalized quantitative data with categorical data\n",
    "Xl = pd.concat([Xc, Xmnr], axis=1)\n",
    "\n",
    "# Show column names of the resulting DataFrame\n",
    "print(\"Columns after combining:\", Xl.columns.values.tolist())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 17. K-means Clustering and Heatmap"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from scipy.cluster.hierarchy import linkage\n",
    "from sklearn.cluster import KMeans\n",
    "\n",
    "# Assuming Xl is your DataFrame\n",
    "DT = Xl.copy()\n",
    "\n",
    "# Keep the original index so the scaled frame can be re-labelled later\n",
    "DT_index = DT.index\n",
    "\n",
    "# Identify the first categorical column (used as the grouping variable)\n",
    "categorical_columns = DT.select_dtypes(include=['object', 'category']).columns\n",
    "if len(categorical_columns) > 0:\n",
    "    group_column = categorical_columns[0]\n",
    "else:\n",
    "    raise ValueError(\"No categorical column found in the DataFrame.\")\n",
    "\n",
    "# Convert the group column to string to avoid mapping issues with categorical types\n",
    "DT[group_column] = DT[group_column].astype(str)\n",
    "\n",
    "# Standardize the quantitative columns (zero mean, unit variance)\n",
    "scaler = StandardScaler()\n",
    "quant_vars = DT.select_dtypes(include=['float64', 'int64']).columns\n",
    "DT_scaled = scaler.fit_transform(DT[quant_vars])\n",
    "\n",
    "# Perform hierarchical clustering (Ward linkage) on samples (rows) and on variables (columns)\n",
    "row_clusters = linkage(DT_scaled, method='ward')\n",
    "col_clusters = linkage(DT_scaled.T, method='ward')\n",
    "\n",
    "# Create a clustered DataFrame\n",
    "clustered_df = pd.DataFrame(data=DT_scaled, columns=quant_vars, index=DT_index)\n",
    "clustered_df['Group'] = DT[group_column]  # Add the group information\n",
    "\n",
    "# Map groups to colors\n",
    "group_colors = {group: color for group, color in zip(DT[group_column].unique(), sns.color_palette(\"hls\", DT[group_column].nunique()))}\n",
    "row_colors = DT[group_column].map(group_colors)  # Apply color mapping\n",
    "\n",
    "# Create heatmap (Blue scale)\n",
    "g1 = sns.clustermap(clustered_df.drop('Group', axis=1), row_linkage=row_clusters, col_linkage=col_clusters,\n",
    "                    row_colors=row_colors, figsize=(12, 10), cmap='Blues', annot=False,\n",
    "                    row_cluster=True, col_cluster=True)\n",
    "plt.setp(g1.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)  # Rotate y-axis labels\n",
    "plt.savefig('heatmap_blue.png')\n",
    "\n",
    "# Create heatmap (Grey scale)\n",
    "g2 = sns.clustermap(clustered_df.drop('Group', axis=1), row_linkage=row_clusters, col_linkage=col_clusters,\n",
    "                    row_colors=row_colors, figsize=(12, 10), cmap='Greys', annot=False,\n",
    "                    row_cluster=True, col_cluster=True)\n",
    "plt.setp(g2.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)  # Rotate y-axis labels\n",
    "plt.savefig('heatmap_grey.png')\n",
    "\n",
    "# Get the order of rows and columns for Blue scale heatmap\n",
    "ordered_cols = clustered_df.drop('Group', axis=1).columns[g1.dendrogram_col.reordered_ind]  # Use reordered indices\n",
    "ordered_rows = clustered_df.index[g1.dendrogram_row.reordered_ind]  # Use reordered indices\n",
    "\n",
    "# Create a DataFrame with ordered samples and their cluster group for Blue scale heatmap\n",
    "# NOTE(review): 'Cluster_Group' is a constant placeholder label here, not a dendrogram cluster id -- confirm intent\n",
    "ordered_sample_cluster_blue = pd.DataFrame({'Sample': ordered_rows, 'Cluster_Group': 'Sample'})\n",
    "# Save to CSV\n",
    "ordered_sample_cluster_blue.to_csv('sample_order_blue.csv', index=False)\n",
    "\n",
    "# Create a DataFrame with ordered variables and their cluster group for Blue scale heatmap\n",
    "ordered_variable_cluster_blue = pd.DataFrame({'Variable': ordered_cols, 'Cluster_Group': 'Variable'})\n",
    "# Save to CSV\n",
    "ordered_variable_cluster_blue.to_csv('variable_order_blue.csv', index=False)\n",
    "\n",
    "# Get the order of rows and columns for Grey scale heatmap\n",
    "ordered_cols_grey = clustered_df.drop('Group', axis=1).columns[g2.dendrogram_col.reordered_ind]  # Use reordered indices\n",
    "ordered_rows_grey = clustered_df.index[g2.dendrogram_row.reordered_ind]  # Use reordered indices\n",
    "\n",
    "# Create a DataFrame with ordered samples and their cluster group for Grey scale heatmap\n",
    "ordered_sample_cluster_grey = pd.DataFrame({'Sample': ordered_rows_grey, 'Cluster_Group': 'Sample'})\n",
    "# Save to CSV\n",
    "ordered_sample_cluster_grey.to_csv('sample_order_grey.csv', index=False)\n",
    "\n",
    "# Create a DataFrame with ordered variables and their cluster group for Grey scale heatmap\n",
    "ordered_variable_cluster_grey = pd.DataFrame({'Variable': ordered_cols_grey, 'Cluster_Group': 'Variable'})\n",
    "# Save to CSV\n",
    "ordered_variable_cluster_grey.to_csv('variable_order_grey.csv', index=False)\n",
    "\n",
    "# Add sample labels to the Blue scale heatmap\n",
    "# NOTE(review): assumes index labels are unique; Index.get_loc returns a non-scalar for duplicate labels -- verify\n",
    "for label in ordered_rows:\n",
    "    g1.ax_heatmap.annotate(label, xy=(0, ordered_rows.get_loc(label)), xytext=(-g1.ax_heatmap.yaxis.get_tick_padding(),\n",
    "                                                                                0),\n",
    "                           textcoords='offset points', va='center', ha='right', fontsize=8)\n",
    "\n",
    "# Apply K-means clustering (n_clusters=4 is a fixed choice -- tune for the dataset)\n",
    "kmeans = KMeans(n_clusters=4, random_state=0).fit(DT_scaled)\n",
    "DT['KMeans_Cluster'] = kmeans.labels_\n",
    "\n",
    "# Get cluster centroids (back-transformed to the original measurement scale)\n",
    "centroids = pd.DataFrame(scaler.inverse_transform(kmeans.cluster_centers_), columns=quant_vars)\n",
    "\n",
    "# Add cluster label to centroids\n",
    "centroids['Cluster'] = centroids.index\n",
    "\n",
    "# Save the results to CSV\n",
    "DT.to_csv('kmeans_with_groups.csv', index=True)  # Index is True to keep the original index\n",
    "centroids.to_csv('kmeans_centroids.csv', index=False)  # Index is False since it's not relevant for centroids"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 18. PLS-DA Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.cross_decomposition import PLSRegression\n",
    "from sklearn.model_selection import StratifiedShuffleSplit\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "# Define the PLS-DA model and cross-validation function\n",
    "def perform_pls_cv(X, y, n_components, cv):\n",
    "    \"\"\"Return the mean classification accuracy of PLS-DA over the CV splits.\n",
    "\n",
    "    Args:\n",
    "        X: Feature matrix (ndarray, samples x features).\n",
    "        y: Integer-coded class labels (1-D ndarray).\n",
    "        n_components: Number of PLS components to fit.\n",
    "        cv: scikit-learn splitter providing split(X, y).\n",
    "    \"\"\"\n",
    "    accuracies = []\n",
    "    for train_index, test_index in cv.split(X, y):\n",
    "        X_train, X_test = X[train_index], X[test_index]\n",
    "        y_train, y_test = y[train_index], y[test_index]\n",
    "\n",
    "        pls = PLSRegression(n_components=n_components)\n",
    "        pls.fit(X_train, y_train)\n",
    "        # PLS regression yields continuous scores; round to the nearest class code\n",
    "        # and flatten the (n, 1) prediction so it matches y_test's shape.\n",
    "        y_pred = np.round(pls.predict(X_test)).ravel()\n",
    "        accuracies.append(accuracy_score(y_test, y_pred))\n",
    "\n",
    "    return np.mean(accuracies)\n",
    "\n",
    "# Detect quantitative and categorical columns\n",
    "quantitative_columns = Xl.select_dtypes(include=['int64', 'float64']).columns\n",
    "categorical_columns = Xl.select_dtypes(include=['object', 'category']).columns\n",
    "\n",
    "# Use the first categorical column as the target, if it exists\n",
    "if len(categorical_columns) > 0:\n",
    "    target_column = categorical_columns[0]\n",
    "else:\n",
    "    raise ValueError(\"No categorical columns found in the DataFrame for PLS-DA.\")\n",
    "\n",
    "# Separate features and target (classes encoded as integer codes)\n",
    "X = Xl[quantitative_columns]\n",
    "y = pd.Categorical(Xl[target_column]).codes\n",
    "\n",
    "# Standardize features\n",
    "scaler = StandardScaler()\n",
    "X_scaled = scaler.fit_transform(X)\n",
    "\n",
    "# Define cross-validation strategy\n",
    "cv = StratifiedShuffleSplit(n_splits=10, test_size=0.33, random_state=42)\n",
    "\n",
    "# Initialize an empty list to store DataFrame fragments\n",
    "results_list = []\n",
    "\n",
    "# Evaluate accuracy for each candidate number of components (2 .. max_components)\n",
    "max_components = min(X_scaled.shape[1], 15)\n",
    "\n",
    "for n_components in range(2, max_components + 1):\n",
    "    accuracy = perform_pls_cv(X_scaled, y, n_components, cv)\n",
    "    results_list.append(pd.DataFrame({'Number_of_Components': [n_components], 'Average_Accuracy': [accuracy]}))\n",
    "\n",
    "# Concatenate all DataFrame fragments into a single DataFrame\n",
    "results_df = pd.concat(results_list, ignore_index=True)\n",
    "\n",
    "# Save the results to a CSV file\n",
    "results_df.to_csv('plsda_components_accuracy.csv', index=False)\n",
    "\n",
    "# Find the optimal number of components; cast to int because values pulled\n",
    "# from a .loc row come back as floats (would otherwise print as e.g. 5.0)\n",
    "optimal_row = results_df.loc[results_df['Average_Accuracy'].idxmax()]\n",
    "optimal_components = int(optimal_row['Number_of_Components'])\n",
    "\n",
    "# Print the optimal number of components\n",
    "print(f\"Optimal number of components: {optimal_components}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 19. PLS-DA Two-Class Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def calculate_vip(model, X):\n",
    "    \"\"\"\n",
    "    Calculate Variable Importance in Projection (VIP) scores for a fitted PLS model.\n",
    "    Args:\n",
    "        model: Fitted PLSRegression model\n",
    "        X: Original feature matrix used in training\n",
    "    Returns:\n",
    "        VIP scores for each feature\n",
    "    \"\"\"\n",
    "    t = model.x_scores_  # Scores (T)\n",
    "    w = model.x_weights_  # Weights (W)\n",
    "    q = model.y_loadings_  # Loadings (Q)\n",
    "\n",
    "    # Sum of squares of the scores (T^2) per component\n",
    "    p = np.sum(t ** 2, axis=0) * q.flatten() ** 2  # Weighted by loadings\n",
    "    w_squared = w ** 2  # Squared weights (features x components)\n",
    "\n",
    "    # Correct the broadcasting\n",
    "    vip_scores = np.sqrt(X.shape[1] * np.dot(w_squared, p) / np.sum(p))\n",
    "    return vip_scores\n",
    "\n",
    "# Standardize features\n",
    "scaler = StandardScaler()\n",
    "X_scaled = scaler.fit_transform(X)\n",
    "\n",
    "# Define cross-validation strategy\n",
    "cv = StratifiedShuffleSplit(n_splits=10, test_size=0.33, random_state=42)\n",
    "\n",
    "# Define PLS-DA model\n",
    "pls_da = PLSRegression(n_components=2)\n",
    "\n",
    "# Initialize lists to store metrics and VIP scores\n",
    "results = []\n",
    "vip_scores_list = []\n",
    "\n",
    "# Perform cross-validation\n",
    "# NOTE(review): the clip to [0, 1] and the 4-value cm.ravel() assume a binary target -- confirm y has exactly two classes\n",
    "for fold_idx, (train_index, test_index) in enumerate(cv.split(X_scaled, y)):\n",
    "    X_train, X_test = X_scaled[train_index], X_scaled[test_index]\n",
    "    y_train, y_test = y[train_index], y[test_index]\n",
    "\n",
    "    # Train PLS-DA model\n",
    "    pls_da.fit(X_train, y_train)\n",
    "\n",
    "    # Predict for training and testing sets: round the continuous PLS output to the\n",
    "    # nearest class, clip to {0, 1}, and flatten the (n, 1) column to 1-D.\n",
    "    # (Fixed: '.ast(int)' was a typo for '.astype(int)' and raised AttributeError.)\n",
    "    y_train_pred = np.clip(np.round(pls_da.predict(X_train)), 0, 1).astype(int).flatten()\n",
    "    y_test_pred = np.clip(np.round(pls_da.predict(X_test)), 0, 1).astype(int).flatten()\n",
    "\n",
    "    # Training metrics\n",
    "    train_cm = confusion_matrix(y_train, y_train_pred)\n",
    "    train_tn, train_fp, train_fn, train_tp = train_cm.ravel()\n",
    "    train_accuracy = accuracy_score(y_train, y_train_pred)\n",
    "    train_sensitivity = train_tp / (train_tp + train_fn) if (train_tp + train_fn) > 0 else 0\n",
    "    train_specificity = train_tn / (train_tn + train_fp) if (train_tn + train_fp) > 0 else 0\n",
    "\n",
    "    # Testing metrics\n",
    "    test_cm = confusion_matrix(y_test, y_test_pred)\n",
    "    test_tn, test_fp, test_fn, test_tp = test_cm.ravel()\n",
    "    test_accuracy = accuracy_score(y_test, y_test_pred)\n",
    "    test_sensitivity = test_tp / (test_tp + test_fn) if (test_tp + test_fn) > 0 else 0\n",
    "    test_specificity = test_tn / (test_tn + test_fp) if (test_tn + test_fp) > 0 else 0\n",
    "\n",
    "    # Calculate VIP scores for the current fold\n",
    "    vip_scores = calculate_vip(pls_da, X_train)\n",
    "    vip_scores_list.append(vip_scores)\n",
    "\n",
    "    # Append results\n",
    "    results.append({\n",
    "        'Fold': fold_idx + 1,\n",
    "        'Train Accuracy': train_accuracy,\n",
    "        'Train Sensitivity': train_sensitivity,\n",
    "        'Train Specificity': train_specificity,\n",
    "        'Train Confusion Matrix': train_cm.tolist(),\n",
    "        'Test Accuracy': test_accuracy,\n",
    "        'Test Sensitivity': test_sensitivity,\n",
    "        'Test Specificity': test_specificity,\n",
    "        'Test Confusion Matrix': test_cm.tolist()\n",
    "    })\n",
    "\n",
    "# Average VIP scores across folds\n",
    "average_vip_scores = np.mean(vip_scores_list, axis=0)\n",
    "\n",
    "# Combine results into a DataFrame\n",
    "results_df = pd.DataFrame(results)\n",
    "\n",
    "# Add VIP scores to the DataFrame (one constant column per feature)\n",
    "for i, feature in enumerate(X.columns):\n",
    "    results_df[f'VIP_{feature}'] = average_vip_scores[i]\n",
    "\n",
    "# Save results to a CSV file\n",
    "results_df.to_csv('plsda_train_test_metrics_with_vip.csv', index=False)\n",
    "\n",
    "# Display results\n",
    "print(results_df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## End of Analysis"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}