59 lines
1.2 KiB
Plaintext
59 lines
1.2 KiB
Plaintext
---
title: "Lab11: NLP"
author: "Vladislav Litvinov <vlad@sek1ro>"
output:
  pdf_document:
    toc_float: TRUE
---
|
|
How does TF-IDF enhance the relevance of a search result?

Why reduce dimensions in text analysis?

- Curse of dimensionality
- Computational cost
- Overfitting

- Feature selection
- Neural embeddings
|
|
```{r}
|
|
library("tm")
|
|
library("wordcloud")
|
|
library("stringr")
|
|
# Load the "crude" example corpus (available once tm is attached).
data("crude")

# Break the second article into sentences: split on the whitespace that
# follows ".", "!" or "?"; the lookbehind keeps the punctuation mark attached
# to the sentence it ends.
sentence_break <- "(?<=[.!?])\\s+"
sentences <- unlist(str_split(crude[[2]]$content, sentence_break))

# Every sentence becomes its own document in a fresh corpus.
docs <- VCorpus(VectorSource(sentences))

# Normalise the text step by step: lower-case, drop digits, drop English
# stop words, drop punctuation, then collapse the leftover runs of whitespace.
docs <- tm_map(docs, content_transformer(tolower))
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, removeWords, stopwords("english"))
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, stripWhitespace)

# Show the cleaned sentences.
sapply(docs, content)
|
|
```
|
|
```{r}
|
|
# Drop documents 10 and 18 from the corpus, then list what is left.
# NOTE(review): presumably these two sentences ended up empty or unusable
# after cleaning; confirm against the output of the previous chunk.
dropped <- c(10, 18)
docs <- docs[-dropped]
sapply(docs, content)
|
|
```
|
|
```{r}
|
|
# Print a ranked term table and draw a word cloud of the heaviest terms.
#
# Args:
#   tdm: a term-document matrix with terms in rows and documents in columns,
#        or any object that as.matrix() turns into such a numeric matrix.
#
# Each term's weight is summed over all documents (row sums) and the terms
# are ranked by that total, largest first. The complete ranking is printed;
# the cloud is limited to 50 terms.
#
# Returns:
#   The ranked data frame (columns `word` and `freq`), invisibly.
showWordCloud <- function(tdm) {
  term_totals <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
  ranked <- data.frame(word = names(term_totals), freq = term_totals)
  print(ranked)

  wordcloud(
    words = ranked$word,
    freq = ranked$freq,
    # Keep every term eligible regardless of how small its total weight is
    # (weights can be fractional when the matrix is not raw counts).
    min.freq = 0,
    max.words = 50,
    # Place words in decreasing-frequency order rather than at random.
    random.order = FALSE
  )

  invisible(ranked)
}
|
|
|
|
# Word cloud built from the corpus with the default term weighting.
term_counts <- TermDocumentMatrix(docs)
showWordCloud(term_counts)
|
|
```
|
|
|
|
```{r}
|
|
# Same corpus, but with terms weighted by TF-IDF before drawing the cloud.
tfidf_control <- list(weighting = weightTfIdf)
tdm <- TermDocumentMatrix(docs, control = tfidf_control)
showWordCloud(tdm)
|
|
```