Commit bb159330 authored by sanjay_pokkali's avatar sanjay_pokkali
Browse files

Fixed baseline and add card image

parent 7a60abf2
Pipeline #4066 failed with stages
in 33 seconds
......@@ -46,6 +46,12 @@
"execution_count": 0,
"metadata": {
"colab": {
<<<<<<< HEAD
"name": "ADCLK_baseline.ipynb",
"provenance": [],
"collapsed_sections": [],
"machine_shape": "hm"
=======
"base_uri": "https://localhost:8080/",
"height": 187
},
......@@ -70,6 +76,7 @@
"Requirement already satisfied: scipy>=0.17.0 in /usr/local/lib/python3.6/dist-packages (from scikit-learn) (1.4.1)\n",
"Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.6/dist-packages (from scikit-learn) (0.14.1)\n"
]
>>>>>>> 7a60abf2d27ed3f036748da561a8994767dee247
}
],
"source": [
......@@ -79,6 +86,1065 @@
"!{sys.executable} -m pip install scikit-learn"
]
},
<<<<<<< HEAD
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "ZtEKOsBu2wCX",
"colab_type": "text"
},
"source": [
"# Baseline for ADCLK Educational Challenge on AIcrowd\n",
"#### Author : Sanjay Pokkali"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Ds8sPZt-2wCY",
"colab_type": "text"
},
"source": [
"## To open this notebook on Google Computing platform Colab, click below!"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "lEB3fPzZ2wCY",
"colab_type": "text"
},
"source": [
"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/googlecolab/colabtools/blob/master/notebooks/colab-github-demo.ipynb)\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "iDNH3tML2wCZ",
"colab_type": "text"
},
"source": [
"## Download Necessary Packages"
]
},
{
"cell_type": "code",
"metadata": {
"collapsed": true,
"id": "iyjnts_i2wCZ",
"colab_type": "code",
"outputId": "b7ed6daa-5f51-43f6-82a8-337a678756a6",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 187
}
},
"source": [
"import sys\n",
"!{sys.executable} -m pip install numpy\n",
"!{sys.executable} -m pip install pandas\n",
"!{sys.executable} -m pip install scikit-learn"
],
"execution_count": 0,
"outputs": [
{
"output_type": "stream",
"text": [
"Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (1.18.3)\n",
"Requirement already satisfied: pandas in /usr/local/lib/python3.6/dist-packages (1.0.3)\n",
"Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from pandas) (2018.9)\n",
"Requirement already satisfied: python-dateutil>=2.6.1 in /usr/local/lib/python3.6/dist-packages (from pandas) (2.8.1)\n",
"Requirement already satisfied: numpy>=1.13.3 in /usr/local/lib/python3.6/dist-packages (from pandas) (1.18.3)\n",
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.6/dist-packages (from python-dateutil>=2.6.1->pandas) (1.12.0)\n",
"Requirement already satisfied: scikit-learn in /usr/local/lib/python3.6/dist-packages (0.22.2.post1)\n",
"Requirement already satisfied: numpy>=1.11.0 in /usr/local/lib/python3.6/dist-packages (from scikit-learn) (1.18.3)\n",
"Requirement already satisfied: scipy>=0.17.0 in /usr/local/lib/python3.6/dist-packages (from scikit-learn) (1.4.1)\n",
"Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.6/dist-packages (from scikit-learn) (0.14.1)\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "hMUMGoZZ2wCd",
"colab_type": "text"
},
"source": [
"## Download data\n",
"The first step is to download out train test data. We will be training a classifier on the train data and make predictions on test data. We submit our predictions\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "KM91aXAw2wCe",
"colab_type": "code",
"outputId": "48d6bcbe-bfda-4ba2-bdb2-e943096f0bdc",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 391
}
},
"source": [
"#Donwload the datasets\n",
"!wget https://s3.eu-central-1.wasabisys.com/aicrowd-practice-challenges/public/adclk/v0.1/train.csv\n",
"!wget https://s3.eu-central-1.wasabisys.com/aicrowd-practice-challenges/public/adclk/v0.1/test.csv"
],
"execution_count": 1,
"outputs": [
{
"output_type": "stream",
"text": [
"--2020-05-16 13:56:36-- https://s3.eu-central-1.wasabisys.com/aicrowd-practice-challenges/public/adclk/v0.1/train.csv\n",
"Resolving s3.eu-central-1.wasabisys.com (s3.eu-central-1.wasabisys.com)... 130.117.252.13, 130.117.252.12, 130.117.252.16, ...\n",
"Connecting to s3.eu-central-1.wasabisys.com (s3.eu-central-1.wasabisys.com)|130.117.252.13|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 2409970 (2.3M) [text/csv]\n",
"Saving to: ‘train.csv’\n",
"\n",
"train.csv 100%[===================>] 2.30M 1.41MB/s in 1.6s \n",
"\n",
"2020-05-16 13:56:39 (1.41 MB/s) - ‘train.csv’ saved [2409970/2409970]\n",
"\n",
"--2020-05-16 13:56:41-- https://s3.eu-central-1.wasabisys.com/aicrowd-practice-challenges/public/adclk/v0.1/test.csv\n",
"Resolving s3.eu-central-1.wasabisys.com (s3.eu-central-1.wasabisys.com)... 130.117.252.13, 130.117.252.16, 130.117.252.12, ...\n",
"Connecting to s3.eu-central-1.wasabisys.com (s3.eu-central-1.wasabisys.com)|130.117.252.13|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 582921 (569K) [text/csv]\n",
"Saving to: ‘test.csv’\n",
"\n",
"test.csv 100%[===================>] 569.26K 494KB/s in 1.2s \n",
"\n",
"2020-05-16 13:56:43 (494 KB/s) - ‘test.csv’ saved [582921/582921]\n",
"\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "U7H2PP692wCg",
"colab_type": "text"
},
"source": [
"\n",
"## Import packages"
]
},
{
"cell_type": "code",
"metadata": {
"id": "zv0sEu7z2wCg",
"colab_type": "code",
"colab": {}
},
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.neural_network import MLPClassifier\n",
"from sklearn.svm import SVC\n",
"from sklearn.metrics import f1_score,precision_score,recall_score,accuracy_score,log_loss"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "Pc0brLwc2wCi",
"colab_type": "text"
},
"source": [
"## Load Data\n",
"We use pandas library to load our data. Pandas loads them into dataframes which helps us analyze our data easily. Learn more about it [here](https://www.tutorialspoint.com/python_data_science/python_pandas.htm)"
]
},
{
"cell_type": "code",
"metadata": {
"id": "eBwUmeV92wCj",
"colab_type": "code",
"colab": {}
},
"source": [
"train_data_path = \"train.csv\" #path where data is stored"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "Mugz2FSA2wCm",
"colab_type": "code",
"colab": {}
},
"source": [
"train_data = pd.read_csv(train_data_path) #load data in dataframe using pandas"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "NMtGgNBv2wCo",
"colab_type": "text"
},
"source": [
"## Visualize the data"
]
},
{
"cell_type": "code",
"metadata": {
"id": "mEKcS2Lr2wCp",
"colab_type": "code",
"outputId": "209a9541-3cbd-4210-8ca9-584ade35e3f4",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 204
}
},
"source": [
"train_data.head()"
],
"execution_count": 22,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>click</th>\n",
" <th>impression</th>\n",
" <th>url_hash</th>\n",
" <th>ad_id</th>\n",
" <th>advertiser_id</th>\n",
" <th>depth</th>\n",
" <th>position</th>\n",
" <th>query_id</th>\n",
" <th>keyword_id</th>\n",
" <th>title_id</th>\n",
" <th>description_id</th>\n",
" <th>user_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>5.660000e+18</td>\n",
" <td>21442160</td>\n",
" <td>37070</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>1430</td>\n",
" <td>4232</td>\n",
" <td>889814</td>\n",
" <td>712389</td>\n",
" <td>249</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>9.750000e+18</td>\n",
" <td>10850149</td>\n",
" <td>29713</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>2457</td>\n",
" <td>503</td>\n",
" <td>1904</td>\n",
" <td>2155</td>\n",
" <td>15084327</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>9.570000e+18</td>\n",
" <td>1973398</td>\n",
" <td>1339</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2088431</td>\n",
" <td>675</td>\n",
" <td>450</td>\n",
" <td>750</td>\n",
" <td>1300996</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>1.660000e+18</td>\n",
" <td>21248222</td>\n",
" <td>2298</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>7151787</td>\n",
" <td>2167</td>\n",
" <td>4258</td>\n",
" <td>4840</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>2.040000e+18</td>\n",
" <td>21194514</td>\n",
" <td>34292</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>9133331</td>\n",
" <td>11660</td>\n",
" <td>59369</td>\n",
" <td>53523</td>\n",
" <td>3047683</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" click impression url_hash ... title_id description_id user_id\n",
"0 0 1 5.660000e+18 ... 889814 712389 249\n",
"1 0 1 9.750000e+18 ... 1904 2155 15084327\n",
"2 0 1 9.570000e+18 ... 450 750 1300996\n",
"3 0 2 1.660000e+18 ... 4258 4840 0\n",
"4 0 1 2.040000e+18 ... 59369 53523 3047683\n",
"\n",
"[5 rows x 12 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 22
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "bubtKctQ2wCr",
"colab_type": "text"
},
"source": [
"We can see the dataset contains 12 columns,where columns 2-12 denotes the information about the person that is called and the first column tell whether he clicked on the ad or not."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "zTHwDehiuQSv",
"colab_type": "text"
},
"source": [
"Let us now pre-process the data to remove any unwanted columns. We remove url_hash and advertiser_id"
]
},
{
"cell_type": "code",
"metadata": {
"id": "9fKmki4WYrMk",
"colab_type": "code",
"outputId": "dde26fa8-6db3-4c58-fef2-ab71e80a9f3b",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 419
}
},
"source": [
"train_data.drop([\"url_hash\",\"advertiser_id\"],axis=1,inplace=True)\n",
"train_data"
],
"execution_count": 23,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>click</th>\n",
" <th>impression</th>\n",
" <th>ad_id</th>\n",
" <th>depth</th>\n",
" <th>position</th>\n",
" <th>query_id</th>\n",
" <th>keyword_id</th>\n",
" <th>title_id</th>\n",
" <th>description_id</th>\n",
" <th>user_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>21442160</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>1430</td>\n",
" <td>4232</td>\n",
" <td>889814</td>\n",
" <td>712389</td>\n",
" <td>249</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>10850149</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>2457</td>\n",
" <td>503</td>\n",
" <td>1904</td>\n",
" <td>2155</td>\n",
" <td>15084327</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1973398</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2088431</td>\n",
" <td>675</td>\n",
" <td>450</td>\n",
" <td>750</td>\n",
" <td>1300996</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>21248222</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>7151787</td>\n",
" <td>2167</td>\n",
" <td>4258</td>\n",
" <td>4840</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>21194514</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>9133331</td>\n",
" <td>11660</td>\n",
" <td>59369</td>\n",
" <td>53523</td>\n",
" <td>3047683</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>39995</th>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>20370275</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>4747974</td>\n",
" <td>12830</td>\n",
" <td>1459695</td>\n",
" <td>25167</td>\n",
" <td>4224468</td>\n",
" </tr>\n",
" <tr>\n",
" <th>39996</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>4222402</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>162741</td>\n",
" <td>2868</td>\n",
" <td>280203</td>\n",
" <td>14903</td>\n",
" <td>3847328</td>\n",
" </tr>\n",
" <tr>\n",
" <th>39997</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>21500376</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>65</td>\n",
" <td>119</td>\n",
" <td>1118</td>\n",
" <td>1319</td>\n",
" <td>6388244</td>\n",
" </tr>\n",
" <tr>\n",
" <th>39998</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>21162526</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>806</td>\n",
" <td>10</td>\n",
" <td>24</td>\n",
" <td>25</td>\n",
" <td>8971395</td>\n",
" </tr>\n",
" <tr>\n",
" <th>39999</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>1763269</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>9650</td>\n",
" <td>16840</td>\n",
" <td>36969</td>\n",
" <td>40258</td>\n",
" <td>756647</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>40000 rows × 10 columns</p>\n",
"</div>"
],
"text/plain": [
" click impression ad_id ... title_id description_id user_id\n",
"0 0 1 21442160 ... 889814 712389 249\n",
"1 0 1 10850149 ... 1904 2155 15084327\n",
"2 0 1 1973398 ... 450 750 1300996\n",
"3 0 2 21248222 ... 4258 4840 0\n",
"4 0 1 21194514 ... 59369 53523 3047683\n",
"... ... ... ... ... ... ... ...\n",
"39995 0 2 20370275 ... 1459695 25167 4224468\n",
"39996 0 1 4222402 ... 280203 14903 3847328\n",
"39997 0 1 21500376 ... 1118 1319 6388244\n",
"39998 1 1 21162526 ... 24 25 8971395\n",
"39999 1 2 1763269 ... 36969 40258 756647\n",
"\n",
"[40000 rows x 10 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 23
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "tN9sICA62wCs",
"colab_type": "text"
},
"source": [
"## Spl