Binary text classification

Related tutorials:

Text classification in TensorFlow

Code:

# Locations of the tab-separated training and validation files.
train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

# Load both splits with identical parsing options.
# NOTE(review): later code addresses the label column as 'ham', which suggests
# the files have no header row and the first record is being consumed as the
# header -- confirm against the data files.
train_df, test_df = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (train_file_path, test_file_path)
)
 
# Balance the training set by dropping the leading surplus 'ham' rows so that
# both classes end up with the same number of examples.
ham_df1 = train_df[train_df['ham'] == 'ham']
spam_df1 = train_df[train_df['ham'] == 'spam']
# Count the classes by label rather than by position in value_counts(), and
# never go negative if 'ham' is not actually the majority class.
tam = max(len(ham_df1) - len(spam_df1), 0)
ham_df1 = ham_df1.iloc[tam:]
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
train_df = pd.concat([ham_df1, spam_df1]).reset_index(drop=True)
 
# Balance the validation set the same way: drop the leading surplus 'ham' rows
# so that both classes end up with the same number of examples.
ham_df2 = test_df[test_df['ham'] == 'ham']
spam_df2 = test_df[test_df['ham'] == 'spam']
# The surplus must be computed from the validation set itself (the original
# mixed in the training-set count), and is clamped so it never goes negative.
tam = max(len(ham_df2) - len(spam_df2), 0)
ham_df2 = ham_df2.iloc[tam:]
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
test_df = pd.concat([ham_df2, spam_df2]).reset_index(drop=True)
 
# Work on copies so that train_df / test_df keep their label column.
train_dataframe = train_df.copy()
test_dataframe = test_df.copy()

# Remove the label column from each copy and encode it as integer category
# codes. pandas sorts the inferred categories, so 'ham' -> 0 and 'spam' -> 1
# when both labels are present.
train_labels = train_dataframe.pop('ham').astype('category').cat.codes
test_labels = test_dataframe.pop('ham').astype('category').cat.codes

# Exactly one column (the message text) must remain; give it a stable name.
train_dataframe.columns = ['msg']
test_dataframe.columns = ['msg']

# Plain Series of message strings, as consumed by the tokenizer.
train_series = train_dataframe['msg']
test_series = test_dataframe['msg']
 
# --- Text preprocessing settings ---
vocab_size = 500          # tokenizer vocabulary limit / embedding input size
oov_tok = "<OOV>"         # placeholder token for out-of-vocabulary words
max_len = 50              # every sequence is padded/truncated to this length
padding_type = "post"     # pad at the end of short sequences
trunc_type = "post"       # cut from the end of long sequences

# --- Model settings ---
embeding_dim = 16         # size of each embedding vector
n_dense = 24              # intended width of the hidden dense layer
drop_value = 0.2          # dropout rate
 
# Word-level tokenizer that maps words to integer ids. It is fitted on the
# training messages only; words it does not keep are mapped to oov_tok.
tokenizer = keras.preprocessing.text.Tokenizer(
    num_words=vocab_size,
    char_level=False,
    oov_token=oov_tok,
)
tokenizer.fit_on_texts(train_series)

# Convert each split to integer sequences, then pad/truncate them to max_len
# so every example has the same length.
training_sequences = tokenizer.texts_to_sequences(train_series)
testing_sequences = tokenizer.texts_to_sequences(test_series)

training_padded = keras.preprocessing.sequence.pad_sequences(
    training_sequences,
    maxlen=max_len,
    padding=padding_type,
    truncating=trunc_type,
)
testing_padded = keras.preprocessing.sequence.pad_sequences(
    testing_sequences,
    maxlen=max_len,
    padding=padding_type,
    truncating=trunc_type,
)
 
# Build the classifier: embedding -> average over the sequence -> one hidden
# dense layer with dropout -> single sigmoid unit for the binary decision.
# NOTE(review): `input_length` is reported as deprecated in Keras 3 -- confirm
# against the installed version before removing it.
model = keras.Sequential([
    keras.layers.Embedding(vocab_size, embeding_dim, input_length=max_len),
    keras.layers.GlobalAveragePooling1D(),
    # Use the n_dense hyperparameter instead of a duplicated literal 24.
    keras.layers.Dense(n_dense, activation='relu'),
    keras.layers.Dropout(drop_value),
    keras.layers.Dense(1, activation='sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train for 30 epochs, reporting loss/accuracy on the validation split after
# each epoch.
model.fit(
    training_padded,
    train_labels,
    epochs=30,
    validation_data=(testing_padded, test_labels),
)
 
def predict_message(pred_text):
    """Classify a single message as ham or spam with the trained model.

    The text is tokenized and padded with the same settings used for the
    training data, then scored by the module-level ``model``.

    Args:
        pred_text: The raw message text to classify.

    Returns:
        A two-element list ``[score, label]`` where ``score`` is the model's
        sigmoid output for the message and ``label`` is ``'spam'`` when the
        score is above 0.5, otherwise ``'ham'``.
    """
    tk_txt = tokenizer.texts_to_sequences([pred_text])
    pad_txt = keras.preprocessing.sequence.pad_sequences(
        tk_txt, maxlen=max_len, padding=padding_type, truncating=trunc_type
    )
    # Run inference once and reuse the score for both the label and the
    # returned value.
    score = model.predict(pad_txt)[0][0]
    label = 'spam' if score > 0.5 else 'ham'
    return [score, label]
 
# Quick sanity check: classify one sample message and show the
# [score, label] result.
pred_text = "how are you doing today?"
prediction = predict_message(pred_text)
print(prediction)

Imports (run these before the code above):

import tensorflow as tf
import pandas as pd
from tensorflow import keras
!pip install tensorflow-datasets
import tensorflow_datasets as tfds
import numpy as np
import matplotlib.pyplot as plt