This is a project to introduce myself to NLP in R.
It uses the Kaggle disaster tweets dataset:
Natural Language Processing with Disaster Tweets | Kaggle
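The code below assumes these packages are loaded (the library calls were not shown in the original chunks):
library(tidyverse) # read_csv, dplyr verbs, tibble
library(tm)        # Corpus, tm_map, DocumentTermMatrix
library(SnowballC) # stemming backend used by stemDocument
library(e1071)     # svm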
tweets <- read_csv('./train.csv', show_col_types = FALSE)
glimpse(tweets)
## Rows: 7,613
## Columns: 5
## $ id <dbl> 1, 4, 5, 6, 7, 8, 10, 13, 14, 15, 16, 17, 18, 19, 20, 23, 24,…
## $ keyword <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ location <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ text <chr> "Our Deeds are the Reason of this #earthquake May ALLAH Forgi…
## $ target <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0…
mean(tweets$target)
## [1] 0.4296598
This is close enough to 0.5 that I will still use accuracy as the score. If it becomes a problem I might use the F1 score instead.
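In case I do switch later, a minimal F1 helper could look like this (a sketch of my own; f1_score is not used anywhere below):
# hypothetical helper: F1 score for binary 0/1 predictions
f1_score <- function(pred, actual) {
  tp <- sum(pred == 1 & actual == 1)
  precision <- tp / sum(pred == 1)
  recall <- tp / sum(actual == 1)
  2 * precision * recall / (precision + recall)
}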
I will use the Corpus object type from the tm package to process the data. A corpus is a collection of text documents, here plain character strings.
Corpus(VectorSource(tweets$text))
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 7613
# using tm package
preprocess_tweets <- function(text, prob = 0.999, extra_blacklist = c()) {
  # bag-of-words encoding of a character vector of tweets;
  # sparse terms are dropped according to prob
  # words to remove, including the 'û' mojibake artifact in this dataset
  blacklist <- c(
    stopwords("english"),
    'û',
    extra_blacklist)
  text %>%
    VectorSource() %>%            # formats a vector as input for Corpus()
    Corpus() %>%                  # creates a list of documents, i.e. a corpus
    tm_map(PlainTextDocument) %>% # converts to plain text documents
    tm_map(tolower) %>%           # makes everything lowercase
    tm_map(removePunctuation) %>% # removes punctuation
    tm_map(removeWords, blacklist) %>% # removes blacklisted words
    tm_map(stemDocument) %>%      # stems words, ex: jumping/jumped -> jump
    DocumentTermMatrix() %>%      # document-term matrix of word counts
    removeSparseTerms(prob) %>%   # drops terms sparser than prob
    as.matrix() %>%
    as_tibble() %>%
    suppressWarnings()
}
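To make the threshold concrete: removeSparseTerms(x, 0.999) drops any term absent from more than 99.9% of documents, so a term must appear in at least 0.1% of the combined tweets (on the order of ten tweets here) to be kept as a predictor.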
I will use the same function on text, keyword, and location to keep things simple. This adds hundreds of new predictors, depending on how many sparse terms are removed.
n <- nrow(tweets)
test <- read_csv('./test.csv', show_col_types = FALSE)
all_tweets <- bind_rows(tweets, test) # combine train and test so both share one term vocabulary
x_text <- preprocess_tweets(all_tweets$text)
x_location <- preprocess_tweets(all_tweets$location, 0.9995)
x_keyword <- preprocess_tweets(all_tweets$keyword, 0.9999)
names(x_text) <- paste0('text_', names(x_text))
names(x_location) <- paste0('location_', names(x_location))
names(x_keyword) <- paste0('keyword_', names(x_keyword))
y_target <- tweets$target
# held out for the Kaggle submission predictions
test_x_text <- x_text[(n + 1):nrow(all_tweets),]
test_x_location <- x_location[(n + 1):nrow(all_tweets),]
test_x_keyword <- x_keyword[(n + 1):nrow(all_tweets),]
x_text <- x_text[1:n,]
x_location <- x_location[1:n,]
x_keyword <- x_keyword[1:n,]
tweets$text[1:4]
## [1] "Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all"
## [2] "Forest fire near La Ronge Sask. Canada"
## [3] "All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected"
## [4] "13,000 people receive #wildfires evacuation orders in California"
x_text[1:4, 1:10]
## # A tibble: 4 × 10
## text_earthquak text_may text_reason text_canada text_fire text_forest
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 1 1 0 0 0
## 2 0 0 0 1 1 1
## 3 0 0 0 0 0 0
## 4 0 0 0 0 0 0
## # … with 4 more variables: text_near <dbl>, text_ask <dbl>, text_evacu <dbl>,
## # text_expect <dbl>
Splitting into training and validation sets. The split is random and unseeded, so exact accuracies below would vary between runs.
train_group <- sample(c(TRUE, FALSE), length(y_target), replace = TRUE, prob = c(0.75, 0.25))
train_x_text <- x_text[train_group,]
train_x_location <- x_location[train_group,]
train_x_keyword <- x_keyword[train_group,]
valid_x_text <- x_text[!train_group,]
valid_x_location <- x_location[!train_group,]
valid_x_keyword <- x_keyword[!train_group,]
train_y <- y_target[train_group]
valid_y <- y_target[!train_group]
Testing validation accuracy with different combinations of text fields and different cost values for a linear SVM.
train_x <- list(text = train_x_text, keyword = train_x_keyword, location = train_x_location)
valid_x <- list(text = valid_x_text, keyword = valid_x_keyword, location = valid_x_location)
c_grid <- c(0.1, 1, 10)
tries <- list(
  c(TRUE, FALSE, FALSE),
  c(TRUE, FALSE, TRUE),
  c(TRUE, TRUE, FALSE),
  c(TRUE, TRUE, TRUE)
)
results <- data.frame()
combinations <- length(c_grid) * length(tries)
j <- 0
for (i in tries) {
  for (c in c_grid) {
    fit <- svm(x = bind_cols(train_x[i]), y = train_y, type = 'C', kernel = 'linear', cost = c, scale = FALSE)
    pred_y <- predict(fit, newdata = bind_cols(valid_x[i]))
    accuracy <- mean(pred_y == valid_y)
    results <- bind_rows(results, data.frame(accuracy = accuracy, cost = c, text = i[1], keyword = i[2], location = i[3]))
    j <- j + 1
    print(paste0(j, '/', combinations, ' completed'))
  }
}
## [1] "1/12 completed"
## [1] "2/12 completed"
## [1] "3/12 completed"
## [1] "4/12 completed"
## [1] "5/12 completed"
## [1] "6/12 completed"
## [1] "7/12 completed"
## [1] "8/12 completed"
## [1] "9/12 completed"
## [1] "10/12 completed"
## [1] "11/12 completed"
## [1] "12/12 completed"
results %>% arrange(desc(accuracy))
## accuracy cost text keyword location
## 1 0.7856064 0.1 TRUE TRUE FALSE
## 2 0.7851032 0.1 TRUE TRUE TRUE
## 3 0.7800705 1.0 TRUE FALSE FALSE
## 4 0.7800705 0.1 TRUE FALSE TRUE
## 5 0.7795672 0.1 TRUE FALSE FALSE
## 6 0.7770508 1.0 TRUE FALSE TRUE
## 7 0.7770508 1.0 TRUE TRUE FALSE
## 8 0.7760443 1.0 TRUE TRUE TRUE
## 9 0.7579265 10.0 TRUE FALSE FALSE
## 10 0.7544036 10.0 TRUE TRUE FALSE
## 11 0.7468546 10.0 TRUE FALSE TRUE
## 12 0.7367891 10.0 TRUE TRUE TRUE
A model trained on text and keyword with a cost of 0.1 performed best, with an accuracy of 0.7856.
train_x <- bind_cols(train_x_text, train_x_keyword)
valid_x <- bind_cols(valid_x_text, valid_x_keyword)
fit <- svm(x = train_x, y = train_y, type = 'C', kernel = 'linear', cost = 0.1, scale = FALSE)
pred_y <- predict(fit, newdata = valid_x) %>% as.character() %>% as.integer()
mean(pred_y) # fraction of positive predictions
## [1] 0.3301459
There is definitely some bias in the predictions. They skew towards "not disaster" even more than the training data: 33% of predictions were disasters versus 43% of the training data. Depending on the application, I could alter the training data to represent each category equally, either by resampling disaster tweets or by removing non-disaster tweets until the categories are balanced.
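As a rough sketch of the downsampling option (my own helper, not run here), assuming a binary 0/1 target with more negatives than positives:
# hypothetical sketch: downsample non-disaster (0) rows to match disasters (1)
balance_downsample <- function(x, y) {
  idx_pos <- which(y == 1)
  idx_neg <- sample(which(y == 0), length(idx_pos)) # keep a random subset of negatives
  keep <- sort(c(idx_pos, idx_neg))
  list(x = x[keep, ], y = y[keep])
}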
data.frame(
predictions = pred_y,
actual = valid_y
) %>%
group_by(predictions) %>%
summarise(real_0 = mean(actual == 0), real_1 = mean(actual == 1))
## # A tibble: 2 × 3
## predictions real_0 real_1
## <int> <dbl> <dbl>
## 1 0 0.778 0.222
## 2 1 0.200 0.800
Of the tweets predicted non-disaster, 78% were actually non-disaster, and of those predicted disaster, 80% were actually disasters.
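Note that this table is grouped by prediction, so these are predictive values. To get per-class recall, the fraction of each actual class labeled correctly, I could group by the actual label instead:
data.frame(predictions = pred_y, actual = valid_y) %>%
  group_by(actual) %>%
  summarise(recall = mean(predictions == actual))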
There are a few things I may want to do in the future:
1. fill in the rest of the missing keywords
2. fill in missing locations using places mentioned in the text
3. make use of the links in tweets
4. use the tweets being replied to
Fitting a final model on all labeled data.
x <- bind_rows(train_x, valid_x)
y <- c(train_y, valid_y)
fit <- svm(x = x, y = y, type = 'C', kernel = 'linear', cost = 0.1, scale = FALSE)
Predicting the target for the Kaggle test data.
pred_submit <- predict(fit, newdata = bind_cols(test_x_text, test_x_keyword))
head(data.frame(prediction = pred_submit, text = test$text), 10)
## prediction
## 1 1
## 2 1
## 3 1
## 4 1
## 5 1
## 6 1
## 7 0
## 8 0
## 9 0
## 10 0
## text
## 1 Just happened a terrible car crash
## 2 Heard about #earthquake is different cities, stay safe everyone.
## 3 there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all
## 4 Apocalypse lighting. #Spokane #wildfires
## 5 Typhoon Soudelor kills 28 in China and Taiwan
## 6 We're shaking...It's an earthquake
## 7 They'd probably still show more life than Arsenal did yesterday, eh? EH?
## 8 Hey! How are you?
## 9 What a nice hat?
## 10 Fuck off!
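To produce a submission file, assuming the competition's standard id/target CSV format:
# write predictions in Kaggle's expected id,target format
tibble(id = test$id, target = as.integer(as.character(pred_submit))) %>%
  write_csv('./submission.csv')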