ABOUT ME

-

Today
-
Yesterday
-
Total
-
  • R program(3)-Wordcloud
    Coding/R program 2020. 12. 21. 18:38

    # Text preprocessing pipeline. `corpus` is defined outside this excerpt.
    #
    # Stopwords are removed BEFORE punctuation is stripped: the SMART list
    # contains contractions such as "don't", and once removePunctuation() has
    # deleted the apostrophe ("dont") those entries can no longer match.
    # Lowercasing comes before stopword removal because the stopword list is
    # lowercase.
    corpus.pre <- tm_map(corpus, removeNumbers) # delete all digits

    corpus.pre <- tm_map(corpus.pre, content_transformer(tolower)) # convert to lowercase

    corpus.pre <- tm_map(corpus.pre, removeWords, words = stopwords("SMART")) # remove stopwords (SMART list)

    corpus.pre <- tm_map(corpus.pre, removePunctuation) # remove punctuation / special characters

    corpus.pre <- tm_map(corpus.pre, stripWhitespace) # collapse runs of whitespace

    corpus.pre <- tm_map(corpus.pre, stemDocument, language = "en") # stemming

     

    # Build both orientations of the count matrix from the cleaned corpus.
    dtm <- DocumentTermMatrix(corpus.pre) # rows = documents, columns = terms

    tdm <- TermDocumentMatrix(corpus.pre) # rows = terms, columns = documents

    # Total frequency of every term across all documents (column sums of dtm).
    # colSums() yields the same named numeric vector as apply(dtm[, ], 2, sum)
    # without the no-op `[, ]` subset or an R-level call to sum() per column.
    word.freq <- colSums(as.matrix(dtm))

    # Equivalent alternative: word.freq <- rowSums(as.matrix(tdm))

     

    # Rank terms from most to least frequent.
    word.freq.sort <- sort(word.freq, decreasing = TRUE)

    # head() is used for the previews instead of [1:20] so that a vocabulary
    # with fewer than 20 terms is not padded with NA entries.
    head(word.freq.sort, 20)

    # Cumulative frequency along the ranked terms.
    word.freq.sum <- cumsum(word.freq.sort)

    head(word.freq.sum, 20)

    # Cumulative proportion: running total divided by the grand total
    # (the last element of the cumulative sum).
    word.freq.ratio <- word.freq.sum / word.freq.sum[length(word.freq.sum)]

    head(word.freq.ratio, 20)

     

    # Plot the cumulative proportion of word occurrences against frequency rank.
    n.words <- length(word.freq)

    plot(seq_len(n.words), word.freq.ratio,
      type = "l", xlab = "Order of word frequency",
      ylab = "Cumulative proportion", main = "", axes = FALSE
    )

    # x axis: rank expressed as a percentage of the vocabulary size.
    axis(1, at = round(0.1 * n.words * (0:10)), labels = paste0(10 * (0:10), "%"))

    # y axis: cumulative proportion in 20% steps.
    axis(2, at = 0.20 * (0:5), labels = paste0(20 * (0:5), "%"))

    # Mark the curve at every 10% of the vocabulary. The positions are rounded
    # to whole ranks so the marker, its label and the value it reads all refer
    # to the same term (a fractional index would be silently truncated), and
    # they line up with the rounded axis ticks above. Ranks below 1 (possible
    # for very small vocabularies) are dropped because index 0 selects nothing.
    decile.idx <- unique(round(0.1 * n.words * (1:9)))
    decile.idx <- decile.idx[decile.idx >= 1]

    if (length(decile.idx) > 0) {
      decile.ratio <- word.freq.ratio[decile.idx]
      text(decile.idx, 0.05 + decile.ratio,
        labels = paste0(round(100 * decile.ratio), "%")
      )
      points(decile.idx, decile.ratio, pch = 19)
    }

     

    ### Wordcloud analysis

    # Install the packages only when they are missing, so re-running the script
    # does not re-download them every time.
    for (pkg in c("wordcloud", "RColorBrewer")) {
      if (!requireNamespace(pkg, quietly = TRUE)) {
        install.packages(pkg)
      }
    }

    library("wordcloud")

    library("RColorBrewer")

    # Draw the word clouds.
    # First cloud: default colour, limited to the 100 most frequent terms.
    wordcloud(names(word.freq),
      freq = word.freq, scale = c(4, 0.2), rot.per = 0.1,
      min.freq = 10, max.words = 100, random.order = FALSE,
      ordered.colors = TRUE
    )

    # Second cloud: coloured with the 8-colour "Dark2" ColorBrewer palette.
    wordcloud(names(word.freq),
      freq = word.freq, scale = c(4, 0.2), rot.per = 0.1,
      min.freq = 10, random.order = FALSE,
      colors = brewer.pal(8, "Dark2")
    )

     

    # rot.per = 0.1 > 전체 단어 중 10%를 세로(90도)로 회전시켜 표시

    # min.freq = 10 > 빈도가 10 미만인 단어는 표시하지 않음

    # max.words = 100 > 빈도가 높은 순으로 최대 100개 단어만 표시

     

    # Extra: preview every colour palette that RColorBrewer provides.

    RColorBrewer::display.brewer.all()

     

    # Interactive variant built with wordcloud2.
    # Install the package only when it is missing.
    if (!requireNamespace("wordcloud2", quietly = TRUE)) {
      install.packages("wordcloud2")
    }

    library(wordcloud2)

    # NOTE(review): this frequency-sorted data frame is not used anywhere in the
    # visible code; it is kept unchanged in case later code relies on it.
    word.freq.wordcloud.df2 <- data.frame(unlist(names(word.freq.sort)), unname(word.freq.sort))

    # Two-column (word, freq) data frame passed to wordcloud2() below.
    # names() already returns a character vector, so no unlist() is needed, and
    # the column names are set directly in the constructor.
    word.freq.wordcloud.df <- data.frame(word = names(word.freq), freq = unname(word.freq))

    word.freq.wordcloud <- wordcloud2(word.freq.wordcloud.df, shape = "circle", size = 0.7, minSize = 10)

    # shape: circle, cardioid, diamond, triangle-forward, triangle, pentagon, star

    # Auto-print the widget to render it.
    word.freq.wordcloud

     

    # Extra: save the wordcloud2 result (HTML, PDF).
    # Install webshot only when it is missing.
    if (!requireNamespace("webshot", quietly = TRUE)) {
      install.packages("webshot")
    }

    library(webshot)

    # webshot renders pages through PhantomJS; download it only if it is not
    # already available instead of reinstalling on every run.
    if (!webshot::is_phantomjs_installed()) {
      webshot::install_phantomjs()
    }

    library("htmlwidgets")

    # Write the widget to HTML. FALSE is spelled out because `F` is an ordinary
    # variable that can be reassigned.
    saveWidget(word.freq.wordcloud, "wordcloud.html", selfcontained = FALSE)

    # Capture the saved page as a PDF; `delay` is the wait in seconds before
    # the capture is taken.
    webshot("wordcloud.html", "wordcloud.pdf", delay = 5, vwidth = 480, vheight = 480)

     

    'Coding > R program' 카테고리의 다른 글

    R program(5)  (0) 2020.12.23
    R program(4)  (0) 2020.12.22
    R program(2)-text 전처리  (0) 2020.12.20
    R(2)  (0) 2020.12.19
    about R-programming  (0) 2020.12.17

    댓글

Designed by Tistory.