{ "cells": [ { "cell_type": "markdown", "id": "c5aaedf3", "metadata": {}, "source": [ "
| \n", " | sepal_length | \n", "sepal_width | \n", "petal_length | \n", "petal_width | \n", "type | \n", "label | \n", "
|---|---|---|---|---|---|---|
| 0 | \n", "5.1 | \n", "3.5 | \n", "1.4 | \n", "0.2 | \n", "setosa | \n", "0 | \n", "
| 1 | \n", "4.9 | \n", "3.0 | \n", "1.4 | \n", "0.2 | \n", "setosa | \n", "0 | \n", "
| 2 | \n", "4.7 | \n", "3.2 | \n", "1.3 | \n", "0.2 | \n", "setosa | \n", "0 | \n", "
| 3 | \n", "4.6 | \n", "3.1 | \n", "1.5 | \n", "0.2 | \n", "setosa | \n", "0 | \n", "
| 4 | \n", "5.0 | \n", "3.6 | \n", "1.4 | \n", "0.2 | \n", "setosa | \n", "0 | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 145 | \n", "6.7 | \n", "3.0 | \n", "5.2 | \n", "2.3 | \n", "virginica | \n", "2 | \n", "
| 146 | \n", "6.3 | \n", "2.5 | \n", "5.0 | \n", "1.9 | \n", "virginica | \n", "2 | \n", "
| 147 | \n", "6.5 | \n", "3.0 | \n", "5.2 | \n", "2.0 | \n", "virginica | \n", "2 | \n", "
| 148 | \n", "6.2 | \n", "3.4 | \n", "5.4 | \n", "2.3 | \n", "virginica | \n", "2 | \n", "
| 149 | \n", "5.9 | \n", "3.0 | \n", "5.1 | \n", "1.8 | \n", "virginica | \n", "2 | \n", "
150 rows × 6 columns
\n", "| \n", " | interarrival_std | \n", "interarrival_mean | \n", "interarrival_min | \n", "interarrival_max | \n", "interarrival_max_min_diff | \n", "interarrival_p10 | \n", "interarrival_p20 | \n", "interarrival_p25 | \n", "interarrival_p30 | \n", "interarrival_p40 | \n", "... | \n", "rtp_interarrival_max_min_R | \n", "rtp_interarrival_kurtosis | \n", "rtp_interarrival_skew | \n", "rtp_interarrival_moment3 | \n", "rtp_interarrival_moment4 | \n", "rtp_interarrival_len_unique_percent | \n", "rtp_interarrival_max_value_count_percent | \n", "rtp_interarrival_min_max_R | \n", "rtp_marker_sum_check | \n", "label | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "0.001927 | \n", "0.010000 | \n", "0.004951 | \n", "0.014423 | \n", "0.009472 | \n", "7.619953e-05 | \n", "8.045912e-05 | \n", "8.572698e-05 | \n", "9.030223e-05 | \n", "9.799051e-05 | \n", "... | \n", "0.500000 | \n", "-3.000000 | \n", "0.000000 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "0.010000 | \n", "1.000000 | \n", "0.500000 | \n", "0 | \n", "Audio | \n", "
| 1 | \n", "0.000515 | \n", "0.020009 | \n", "0.019227 | \n", "0.021251 | \n", "0.002024 | \n", "1.931565e-04 | \n", "1.953020e-04 | \n", "1.958430e-04 | \n", "1.965890e-04 | \n", "1.985469e-04 | \n", "... | \n", "0.500000 | \n", "-3.000000 | \n", "0.000000 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "0.020000 | \n", "1.000000 | \n", "0.500000 | \n", "0 | \n", "Audio | \n", "
| 2 | \n", "0.041315 | \n", "0.019994 | \n", "0.000000 | \n", "0.143393 | \n", "0.143393 | \n", "9.536743e-09 | \n", "9.536743e-09 | \n", "9.536743e-09 | \n", "1.907349e-08 | \n", "4.053116e-08 | \n", "... | \n", "0.500000 | \n", "-3.000000 | \n", "0.000000 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "0.019231 | \n", "1.000000 | \n", "0.500000 | \n", "0 | \n", "Audio | \n", "
| 3 | \n", "0.008119 | \n", "0.019954 | \n", "0.000873 | \n", "0.044432 | \n", "0.043559 | \n", "9.701633e-05 | \n", "1.477895e-04 | \n", "1.699674e-04 | \n", "1.779909e-04 | \n", "1.895509e-04 | \n", "... | \n", "0.500000 | \n", "-3.000000 | \n", "0.000000 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "0.020000 | \n", "1.000000 | \n", "0.500000 | \n", "0 | \n", "Audio | \n", "
| 4 | \n", "0.018683 | \n", "0.020117 | \n", "0.000001 | \n", "0.121093 | \n", "0.121092 | \n", "1.023531e-05 | \n", "7.453918e-05 | \n", "1.209468e-04 | \n", "1.324451e-04 | \n", "1.531601e-04 | \n", "... | \n", "0.500000 | \n", "-3.000000 | \n", "0.000000 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "0.021739 | \n", "1.000000 | \n", "0.500000 | \n", "0 | \n", "Audio | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 139995 | \n", "0.000799 | \n", "0.337698 | \n", "0.336812 | \n", "0.338365 | \n", "0.001553 | \n", "3.370330e-03 | \n", "3.372540e-03 | \n", "3.373646e-03 | \n", "3.374751e-03 | \n", "3.376961e-03 | \n", "... | \n", "0.511905 | \n", "-1.500000 | \n", "-0.707107 | \n", "-2.211840e+08 | \n", "3.185050e+11 | \n", "0.666667 | \n", "0.666667 | \n", "0.488095 | \n", "3 | \n", "ScreenSharing | \n", "
| 139996 | \n", "0.159892 | \n", "0.239946 | \n", "0.000108 | \n", "0.320163 | \n", "0.320055 | \n", "9.596729e-04 | \n", "1.918266e-03 | \n", "2.397562e-03 | \n", "2.876859e-03 | \n", "3.196862e-03 | \n", "... | \n", "1.000000 | \n", "-0.671026 | \n", "-1.148811 | \n", "-2.524719e+12 | \n", "6.654528e+16 | \n", "1.000000 | \n", "0.250000 | \n", "0.000000 | \n", "3 | \n", "ScreenSharing | \n", "
| 139997 | \n", "0.045574 | \n", "0.040176 | \n", "0.000012 | \n", "0.151814 | \n", "0.151802 | \n", "1.705837e-05 | \n", "3.843689e-05 | \n", "6.171942e-05 | \n", "1.125135e-04 | \n", "2.727780e-04 | \n", "... | \n", "0.500000 | \n", "-3.000000 | \n", "0.000000 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "0.043478 | \n", "1.000000 | \n", "0.500000 | \n", "23 | \n", "ScreenSharing | \n", "
| 139998 | \n", "0.028728 | \n", "0.325410 | \n", "0.299745 | \n", "0.356444 | \n", "0.056699 | \n", "3.038041e-03 | \n", "3.078630e-03 | \n", "3.098925e-03 | \n", "3.119220e-03 | \n", "3.159810e-03 | \n", "... | \n", "0.511144 | \n", "-1.500000 | \n", "-0.695813 | \n", "-1.628640e+08 | \n", "2.163721e+11 | \n", "1.000000 | \n", "0.333333 | \n", "0.488856 | \n", "3 | \n", "ScreenSharing | \n", "
| 139999 | \n", "0.004189 | \n", "0.040222 | \n", "0.032511 | \n", "0.049401 | \n", "0.016890 | \n", "3.474479e-04 | \n", "3.678946e-04 | \n", "3.826904e-04 | \n", "3.873811e-04 | \n", "3.936524e-04 | \n", "... | \n", "0.500000 | \n", "-3.000000 | \n", "0.000000 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "0.040000 | \n", "1.000000 | \n", "0.500000 | \n", "25 | \n", "ScreenSharing | \n", "
140000 rows × 96 columns
\n", "" ], "text/plain": [ " interarrival_std interarrival_mean interarrival_min \\\n", "0 0.001927 0.010000 0.004951 \n", "1 0.000515 0.020009 0.019227 \n", "2 0.041315 0.019994 0.000000 \n", "3 0.008119 0.019954 0.000873 \n", "4 0.018683 0.020117 0.000001 \n", "... ... ... ... \n", "139995 0.000799 0.337698 0.336812 \n", "139996 0.159892 0.239946 0.000108 \n", "139997 0.045574 0.040176 0.000012 \n", "139998 0.028728 0.325410 0.299745 \n", "139999 0.004189 0.040222 0.032511 \n", "\n", " interarrival_max interarrival_max_min_diff interarrival_p10 \\\n", "0 0.014423 0.009472 7.619953e-05 \n", "1 0.021251 0.002024 1.931565e-04 \n", "2 0.143393 0.143393 9.536743e-09 \n", "3 0.044432 0.043559 9.701633e-05 \n", "4 0.121093 0.121092 1.023531e-05 \n", "... ... ... ... \n", "139995 0.338365 0.001553 3.370330e-03 \n", "139996 0.320163 0.320055 9.596729e-04 \n", "139997 0.151814 0.151802 1.705837e-05 \n", "139998 0.356444 0.056699 3.038041e-03 \n", "139999 0.049401 0.016890 3.474479e-04 \n", "\n", " interarrival_p20 interarrival_p25 interarrival_p30 \\\n", "0 8.045912e-05 8.572698e-05 9.030223e-05 \n", "1 1.953020e-04 1.958430e-04 1.965890e-04 \n", "2 9.536743e-09 9.536743e-09 1.907349e-08 \n", "3 1.477895e-04 1.699674e-04 1.779909e-04 \n", "4 7.453918e-05 1.209468e-04 1.324451e-04 \n", "... ... ... ... \n", "139995 3.372540e-03 3.373646e-03 3.374751e-03 \n", "139996 1.918266e-03 2.397562e-03 2.876859e-03 \n", "139997 3.843689e-05 6.171942e-05 1.125135e-04 \n", "139998 3.078630e-03 3.098925e-03 3.119220e-03 \n", "139999 3.678946e-04 3.826904e-04 3.873811e-04 \n", "\n", " interarrival_p40 ... rtp_interarrival_max_min_R \\\n", "0 9.799051e-05 ... 0.500000 \n", "1 1.985469e-04 ... 0.500000 \n", "2 4.053116e-08 ... 0.500000 \n", "3 1.895509e-04 ... 0.500000 \n", "4 1.531601e-04 ... 0.500000 \n", "... ... ... ... \n", "139995 3.376961e-03 ... 0.511905 \n", "139996 3.196862e-03 ... 1.000000 \n", "139997 2.727780e-04 ... 0.500000 \n", "139998 3.159810e-03 ... 
0.511144 \n", "139999 3.936524e-04 ... 0.500000 \n", "\n", " rtp_interarrival_kurtosis rtp_interarrival_skew \\\n", "0 -3.000000 0.000000 \n", "1 -3.000000 0.000000 \n", "2 -3.000000 0.000000 \n", "3 -3.000000 0.000000 \n", "4 -3.000000 0.000000 \n", "... ... ... \n", "139995 -1.500000 -0.707107 \n", "139996 -0.671026 -1.148811 \n", "139997 -3.000000 0.000000 \n", "139998 -1.500000 -0.695813 \n", "139999 -3.000000 0.000000 \n", "\n", " rtp_interarrival_moment3 rtp_interarrival_moment4 \\\n", "0 0.000000e+00 0.000000e+00 \n", "1 0.000000e+00 0.000000e+00 \n", "2 0.000000e+00 0.000000e+00 \n", "3 0.000000e+00 0.000000e+00 \n", "4 0.000000e+00 0.000000e+00 \n", "... ... ... \n", "139995 -2.211840e+08 3.185050e+11 \n", "139996 -2.524719e+12 6.654528e+16 \n", "139997 0.000000e+00 0.000000e+00 \n", "139998 -1.628640e+08 2.163721e+11 \n", "139999 0.000000e+00 0.000000e+00 \n", "\n", " rtp_interarrival_len_unique_percent \\\n", "0 0.010000 \n", "1 0.020000 \n", "2 0.019231 \n", "3 0.020000 \n", "4 0.021739 \n", "... ... \n", "139995 0.666667 \n", "139996 1.000000 \n", "139997 0.043478 \n", "139998 1.000000 \n", "139999 0.040000 \n", "\n", " rtp_interarrival_max_value_count_percent rtp_interarrival_min_max_R \\\n", "0 1.000000 0.500000 \n", "1 1.000000 0.500000 \n", "2 1.000000 0.500000 \n", "3 1.000000 0.500000 \n", "4 1.000000 0.500000 \n", "... ... ... \n", "139995 0.666667 0.488095 \n", "139996 0.250000 0.000000 \n", "139997 1.000000 0.500000 \n", "139998 0.333333 0.488856 \n", "139999 1.000000 0.500000 \n", "\n", " rtp_marker_sum_check label \n", "0 0 Audio \n", "1 0 Audio \n", "2 0 Audio \n", "3 0 Audio \n", "4 0 Audio \n", "... ... ... 
\n", "139995 3 ScreenSharing \n", "139996 3 ScreenSharing \n", "139997 23 ScreenSharing \n", "139998 3 ScreenSharing \n", "139999 25 ScreenSharing \n", "\n", "[140000 rows x 96 columns]" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_csv(\"RTP_dataset.csv\")\n", "df" ] }, { "cell_type": "markdown", "id": "0ac3650a", "metadata": {}, "source": [ "### 2.2 Binary classification\n", "From now on, we focus on two major classes, **Video** and **Audio**, and you need to develop an ML pipeline to classify the traffic based on statistical features. Specifically, you will perform the following steps:\n", "- Data preprocessing\n", "- Model development (perform ERM with an algorithm)\n", "- Performance evaluation" ] }, { "cell_type": "markdown", "id": "bb0eab26", "metadata": {}, "source": [ "### 2.2.1 Dataset preprocessing - Data split and standardization\n", "- Extract only the data associated with the aforementioned classes.\n", "- For each class, assign a numerical label (0 to Video and 1 to Audio).\n", "- Split the whole dataset into training and test sets. Stratify the split, keeping the 70/30 proportion (i.e., the training set contains 70% of the samples per label, and the test set contains the remaining 30% per label).\n", "- After the splitting, standardize the data (features). Fit the StandardScaler only on the training set and then transform both the training and test sets. From now on, you will use the same standardized datasets for all the experiments." 
] }, { "cell_type": "code", "execution_count": null, "id": "9ff3ad91", "metadata": {}, "outputs": [], "source": [ "# This part is provided\n", "# You can simply run this cell\n", "\n", "# extract data from Video and Audio\n", "# we have to perform a copy of the dataset otherwise we will modify the original dataset\n", "\n", "video = ['FEC-Video', 'HighQ', 'LowQ', 'MediumQ']\n", "audio = ['Audio', 'FEC-Audio']\n", "screen = ['ScreenSharing']\n", "\n", "video_data = df[df[\"label\"].isin(video)].copy()\n", "audio_data = df[df[\"label\"].isin(audio)].copy()\n", "\n", "video_data[\"binary_label\"]=0\n", "audio_data[\"binary_label\"]=1\n", "\n", "video_data = video_data.drop(\"label\",axis=1)\n", "audio_data = audio_data.drop(\"label\",axis=1)\n", "\n", "binary_dataset = pd.concat([video_data, audio_data])\n", "\n", "# prepare the new dataset\n", "# get the X and y from the dataset\n", "X = binary_dataset.drop(columns=['binary_label']).to_numpy()\n", "y = binary_dataset[['binary_label']].to_numpy()" ] }, { "cell_type": "code", "execution_count": null, "id": "7e1cf5e9", "metadata": {}, "outputs": [], "source": [ "# your answer here\n", "\n", "# run stratified training-test splitting using train_test_split\n", "X_train, X_test, y_train, y_test = train_test_split(\n", " X,\n", " y,\n", " stratify = y, # stratify the dataset based on class labels\n", " train_size = 0.7, # percentage of training set\n", " random_state = 42\n", ")\n", "\n", "# standardize data using StandardScaler\n", "scaler = StandardScaler()\n", "X_train_s = scaler.fit_transform(X_train, y_train)\n", "X_test_s = scaler.transform(X_test)" ] }, { "cell_type": "markdown", "id": "2ff6a273", "metadata": {}, "source": [ "### 2.2.2 Dataset preprocessing - Removal of correlated features\n", "- For the training set, compute and display the correlation matrix between the features (refer to lab 2 for details).\n", "- Remove strongly correlated features from both training and test sets, i.e., features having a 
correlation > 0.8. Note that a feature may be strongly correlated with many others.\n", " - How many correlated features do you have to remove?" ] }, { "cell_type": "code", "execution_count": 56, "id": "280530a5", "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "RandomForestClassifier(n_estimators=30)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestClassifier(n_estimators=30)
DecisionTreeRegressor()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DecisionTreeRegressor()