renaming
This commit is contained in:
59
5/data science/r/12.Rmd
Normal file
59
5/data science/r/12.Rmd
Normal file
@ -0,0 +1,59 @@
|
||||
---
title: "Lab11: NLP"
author: "Vladislav Litvinov <vlad@sek1ro>"
output:
  pdf_document:
    toc: TRUE
---
|
||||
How does TF-IDF enhance the relevance of a search result?
|
||||
Why reduce dimensions in text analysis?

- Curse of dimensionality
- Computational cost
- Overfitting
|
||||
|
||||
Approaches to reducing dimensionality:

- Feature selection
- Neural embeddings
|
||||
```{r}
|
||||
library("tm")
|
||||
library("wordcloud")
|
||||
library("stringr")
|
||||
# Load the `crude` dataset.
data("crude")

# Split the text of the second document into sentences: break on any run of
# whitespace that directly follows ".", "!" or "?".
docs <- unlist(str_split(crude[[2]]$content, "(?<=[.!?])\\s+"))

# Treat every sentence as a separate document of a new corpus.
docs <- VCorpus(VectorSource(docs))

# Clean each document step by step. Stop words are removed before punctuation
# is stripped; the order of these steps is kept exactly as written.
docs <- tm_map(docs, content_transformer(tolower))       # lower-case
docs <- tm_map(docs, removeNumbers)                      # drop digits
docs <- tm_map(docs, removeWords, stopwords("english"))  # drop stop words
docs <- tm_map(docs, removePunctuation)                  # drop punctuation
docs <- tm_map(docs, stripWhitespace)                    # collapse whitespace

# Show the cleaned text of every document.
sapply(docs, content)
|
||||
```
|
||||
```{r}
|
||||
# Remove documents 10 and 18 from the corpus, then show what remains.
# NOTE(review): the indices are hard-coded, presumably pointing at sentences
# that are unusable after cleaning -- confirm against the output above.
# Running this chunk a second time would drop two more documents.
docs <- docs[-c(10, 18)]
sapply(docs, content)
|
||||
```
|
||||
```{r}
|
||||
# Print per-term totals and draw a word cloud for a term-document matrix.
#
# Arguments:
#   tdm  A term-document matrix (terms in rows), e.g. the result of
#        TermDocumentMatrix(); it is converted with as.matrix().
#
# Side effects: prints the word/frequency table and draws the cloud on the
# active graphics device.
# Returns: the value of the wordcloud() call.
showWordCloud <- function(tdm) {
  # Sum each term's weight over all documents, largest total first.
  term_totals <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
  d <- data.frame(word = names(term_totals), freq = term_totals)
  print(d)

  wordcloud(
    words = d$word,
    freq = d$freq,
    min.freq = 0,         # do not filter out low-weight terms
    max.words = 50,       # cap the number of plotted terms
    random.order = FALSE  # plot terms in decreasing order of frequency
  )
}
|
||||
|
||||
# Word cloud built from the term-document matrix with default settings.
showWordCloud(TermDocumentMatrix(docs))
|
||||
```
|
||||
|
||||
```{r}
|
||||
# Build the term-document matrix again, this time with TF-IDF weighting,
# and draw the word cloud for the re-weighted terms.
tdm <- TermDocumentMatrix(
  docs,
  control = list(weighting = weightTfIdf)
)
showWordCloud(tdm)
|
||||
```
|
||||
Reference in New Issue
Block a user