Kaggle ML

Just some code I wrote after finishing the 統測 (Taiwan's unified entrance exam).

1. Data Science Job Salaries

Dataset: Link

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

data = pd.read_csv("/content/ds_salaries.csv")

# Use only the categorical columns as features; "salary" and "salary_in_usd"
# must stay out of the feature list, or the target leaks into the model.
features = ["experience_level", "employment_type", "company_size"]

# Label-encode each categorical column (integer codes are fine for trees).
le = LabelEncoder()
for col in features:
  data[col] = le.fit_transform(data[col])

X = data[features]
# Predict the USD-normalized salary; the raw "salary" column mixes currencies.
y = data["salary_in_usd"]

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
model = RandomForestRegressor(n_estimators=200, random_state=1)
model.fit(x_train, y_train)
print(model.score(x_test, y_test))  # R^2 on the held-out 20%
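
A quick optional follow-up sketch (assuming the model and split above): mean absolute error is easier to read than R^2 for salaries, and the forest's feature importances show which encoded columns drive the prediction.

from sklearn.metrics import mean_absolute_error

pred = model.predict(x_test)
print("MAE (USD):", mean_absolute_error(y_test, pred))

# Relative importance of each encoded feature.
for name, importance in zip(features, model.feature_importances_):
  print(f"{name}: {importance:.3f}")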
  

2. Titanic Survival Prediction

Dataset: Link

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

features = ["Sex", "Age", "Ticket", "Cabin"]

# Cast to string so NaN becomes an ordinary category, and fit each encoder
# on train and test together so both sets share the same integer codes
# (fitting separately would assign different codes to the same value).
for col in features:
  le = LabelEncoder()
  combined = pd.concat([train_data[col], test_data[col]]).astype(str)
  le.fit(combined)
  train_data[col] = le.transform(train_data[col].astype(str))
  test_data[col] = le.transform(test_data[col].astype(str))

x_train = train_data[features]
y_train = train_data["Survived"]
x_test = test_data[features]

model = RandomForestClassifier(n_estimators=200, random_state=1)
model.fit(x_train, y_train)
prediction = model.predict(x_test)

submission = pd.DataFrame({"PassengerId": test_data["PassengerId"], "Survived": prediction})
submission.to_csv("./submission.csv", index=False)
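
Kaggle keeps the test labels hidden, so a reasonable local sanity check is cross-validation on the training set (a sketch, assuming the model and x_train/y_train above):

from sklearn.model_selection import cross_val_score

scores = cross_val_score(model, x_train, y_train, cv=5)
print("5-fold accuracy: %.3f" % scores.mean())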
  

3. SMS Spam Classification

Dataset: Link

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

data_train = pd.read_csv("SMSCollection.csv", encoding="ISO-8859-1")

x_train, x_test, y_train, y_test = train_test_split(data_train.sms, data_train.Class, test_size=0.2, random_state=1)

vectorizer = TfidfVectorizer(max_features=2000)
x_train = vectorizer.fit_transform(x_train)
# Only transform the test set: refitting the vectorizer here would build a
# different vocabulary, so train and test features would not line up.
x_test = vectorizer.transform(x_test)

clf = MultinomialNB()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(metrics.accuracy_score(y_test, y_pred))
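
Most messages in this dataset are ham, so accuracy alone can overstate the model; a per-class breakdown (using y_test and y_pred from above) is more informative:

print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))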
  

4. Intel Image Classification

Dataset: Link

from google.colab import files
import os

if not os.path.exists("kaggle.json"):
  uploaded = files.upload()  # upload kaggle.json
  os.makedirs("/root/.kaggle/", exist_ok=True)
  with open("/root/.kaggle/kaggle.json", "wb") as f:
    f.write(uploaded["kaggle.json"])

!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d puneet6060/intel-image-classification --unzip

import numpy as np
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Input, Add, Dense, Activation, ZeroPadding2D, BatchNormalization, Flatten, Conv2D, AveragePooling2D, MaxPooling2D
from tensorflow.keras.initializers import glorot_uniform
from tensorflow.keras.models import Model

train_data_dir = "/content/seg_train"
test_data_dir = "/content/seg_test"

# Hold out 20% of the training directory for validation.
image_generator = ImageDataGenerator(rescale = 1/255, validation_split = 0.2)
train_data = image_generator.flow_from_directory(directory = train_data_dir,
                          batch_size = 32,
                          target_size = (224, 224),
                          subset = "training",
                          class_mode = "categorical")

# subset must be "validation" here; passing "training" again would make
# val_data the same split as train_data.
val_data = image_generator.flow_from_directory(directory = train_data_dir,
                          batch_size = 32,
                          target_size = (224, 224),
                          subset = "validation",
                          class_mode = "categorical")

test_gen = ImageDataGenerator(rescale = 1/255)
test_data = test_gen.flow_from_directory(test_data_dir, target_size = (224, 224), batch_size = 32, shuffle = False)

print(train_data.class_indices)
class_names = list(train_data.class_indices)  # class names, in label-index order
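
An optional sanity check before training: pull one batch from the generator and display a few images with their decoded labels (a sketch; assumes matplotlib is available, as it is in Colab).

import matplotlib.pyplot as plt

images, labels = next(train_data)  # one batch: (32, 224, 224, 3) images and one-hot labels
plt.figure(figsize = (8, 8))
for i in range(9):
  plt.subplot(3, 3, i + 1)
  plt.imshow(images[i])
  plt.title(class_names[np.argmax(labels[i])])
  plt.axis("off")
plt.show()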

def identity_block(x, f, filters):
  # Standard ResNet identity block: three convolutions plus a skip connection.
  F1, F2, F3 = filters
  x_shortcut = x
  x = Conv2D(filters = F1, kernel_size = (1,1), strides = (1,1), padding = "valid",
         kernel_initializer = glorot_uniform(seed = 1))(x)
  x = BatchNormalization(axis = 3)(x)
  x = Activation("relu")(x)
  # The middle convolution takes the (f, f) kernel; a (1,1) kernel here would
  # leave the f argument unused.
  x = Conv2D(filters = F2, kernel_size = (f,f), strides = (1,1), padding = "same",
         kernel_initializer = glorot_uniform(seed = 1))(x)
  x = BatchNormalization(axis = 3)(x)
  x = Activation("relu")(x)
  x = Conv2D(filters = F3, kernel_size = (1,1), strides = (1,1), padding = "valid",
         kernel_initializer = glorot_uniform(seed = 1))(x)
  x = BatchNormalization(axis = 3)(x)
  x = Add()([x, x_shortcut])
  x = Activation("relu")(x)
  return x

def convolutional_block(x, f, filters, s):
  # Like the identity block, but the shortcut also passes through a strided
  # 1x1 convolution so its shape matches the main path.
  F1, F2, F3 = filters

  x_shortcut = x

  x = Conv2D(F1, (1,1), strides = (s,s), kernel_initializer = glorot_uniform(seed = 1))(x)
  x = BatchNormalization(axis = 3)(x)
  x = Activation("relu")(x)

  x = Conv2D(F2, (f,f), strides = (1,1), padding = "same", kernel_initializer = glorot_uniform(seed = 1))(x)
  x = BatchNormalization(axis = 3)(x)
  x = Activation("relu")(x)
  
  x = Conv2D(F3, (1,1), strides = (1,1), padding = "valid", kernel_initializer = glorot_uniform(seed = 1))(x)
  x = BatchNormalization(axis = 3)(x)
  
  x_shortcut = Conv2D(F3, (1,1), strides = (s,s), padding = "valid", kernel_initializer = glorot_uniform(seed = 1))(x_shortcut)
  x_shortcut = BatchNormalization(axis = 3)(x_shortcut)
  x = Add()([x, x_shortcut])
  x = Activation('relu')(x)
  return x

def ResNet50(input_shape = (224, 224, 3), classes = 6):
  # ResNet-50 assembled from the blocks above (stage depths 3, 4, 6, 3).
  x_input = Input(input_shape)

  x = ZeroPadding2D((3,3))(x_input)

  x = Conv2D(64, (7,7), strides = (2,2), kernel_initializer = glorot_uniform(seed = 1))(x)
  x = BatchNormalization(axis = 3)(x)
  x = Activation("relu")(x)
  x = MaxPooling2D((3,3), strides = (2,2))(x)

  x = convolutional_block(x, f=3, filters = [64, 64, 256], s = 1)
  x = identity_block(x, 3, [64, 64, 256])
  x = identity_block(x, 3, [64, 64, 256])

  x = convolutional_block(x, f=3, filters = [128, 128, 512], s = 2)
  x = identity_block(x, 3, [128, 128, 512])
  x = identity_block(x, 3, [128, 128, 512])
  x = identity_block(x, 3, [128, 128, 512])

  x = convolutional_block(x, f=3, filters = [256, 256, 1024], s = 2)
  x = identity_block(x, 3, [256, 256, 1024])
  x = identity_block(x, 3, [256, 256, 1024])
  x = identity_block(x, 3, [256, 256, 1024])
  x = identity_block(x, 3, [256, 256, 1024])
  x = identity_block(x, 3, [256, 256, 1024])

  x = convolutional_block(x, f=3, filters = [512, 512, 2048], s = 2)
  x = identity_block(x, 3, [512, 512, 2048])
  x = identity_block(x, 3, [512, 512, 2048])

  x = AveragePooling2D()(x)

  x = Flatten()(x)
  x = Dense(classes, activation = 'softmax', kernel_initializer = glorot_uniform(seed = 1))(x)  # use the classes argument, not a hard-coded 6

  model = Model(inputs = x_input, outputs = x)

  return model


model = ResNet50()
model.summary()  # summary() prints on its own; wrapping it in print() adds a stray "None"
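
The code above stops at building the architecture. A minimal training sketch, assuming the generators from earlier and an arbitrary (untuned) epoch count:

model.compile(optimizer = "adam",
              loss = "categorical_crossentropy",
              metrics = ["accuracy"])

# epochs = 10 is a placeholder starting point, not a tuned value.
model.fit(train_data, validation_data = val_data, epochs = 10)

# Evaluate on the held-out seg_test directory.
model.evaluate(test_data)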