Re: [問題] 關於文字探勘消失

看板R_Language作者wheado時間11年前 (2015/08/22 13:56)推噓0(0推 0噓 0→)

留言0則, 0人參與討論串2/2 (看更多)

先安裝套件"slam" 執行下面的code ############################################################################## ## Product TermDocumentMatrix for Chinese on R-3.2.2 x64 ## Modified command "words" on package NLP wordsCN<-function(x,...){ words<-unlist(segmentCN(x$content)) return(words) } ## Modified command "termFreq" on package tm termFreqCN<- function (doc, control = list()) { stopifnot(inherits(doc, "TextDocument"), is.list(control)) .tokenize <- control$tokenize if (is.null(.tokenize) || identical(.tokenize, "wordsCN")) .tokenize <- wordsCN else if (identical(.tokenize, "MC")) .tokenize <- MC_tokenizer else if (identical(.tokenize, "scan")) .tokenize <- scan_tokenizer else if (NLP::is.Span_Tokenizer(.tokenize)) .tokenize <- NLP::as.Token_Tokenizer(.tokenize) if (is.function(.tokenize)) txt <- .tokenize(doc) else stop("invalid tokenizer") .tolower <- control$tolower if (is.null(.tolower) || isTRUE(.tolower)) .tolower <- tolower if (is.function(.tolower)) txt <- .tolower(txt) .removePunctuation <- control$removePunctuation if (isTRUE(.removePunctuation)) .removePunctuation <- removePunctuation else if (is.list(.removePunctuation)) .removePunctuation <- function(x) do.call(removePunctuation, c(list(x), control$removePunctuation)) .removeNumbers <- control$removeNumbers if (isTRUE(.removeNumbers)) .removeNumbers <- removeNumbers .stopwords <- control$stopwords if (isTRUE(.stopwords)) .stopwords <- function(x) x[is.na(match(x, stopwords(meta(doc, "language"))))] else if (is.character(.stopwords)) .stopwords <- function(x) x[is.na(match(x, control$stopwords))] .stemming <- control$stemming if (isTRUE(.stemming)) .stemming <- function(x) stemDocument(x, meta(doc, "language")) or <- c("removePunctuation", "removeNumbers", "stopwords", "stemming") nc <- names(control) n <- nc[nc %in% or] for (name in sprintf(".%s", c(n, setdiff(or, n)))) { g <- get(name) if (is.function(g)) txt <- g(txt) } if (is.null(txt)) return(setNames(integer(0), character(0))) dictionary <- control$dictionary tab <- if (is.null(dictionary)) table(txt) else table(factor(txt, levels = dictionary)) if (names(tab[1])=="") tab <- tab[-1] bl <- control$bounds$local if (length(bl) == 2L && is.numeric(bl)) tab <- tab[(tab >= bl[1]) & (tab <= bl[2])] nc <- nchar(names(tab), type = "chars") wl <- control$wordLengths lb <- if (is.numeric(wl[1])) wl[1] else 3 ub <- if (is.numeric(wl[2])) wl[2] else Inf tab <- tab[(nc >= lb) & (nc <= ub)] storage.mode(tab) <- "integer" class(tab) <- c("term_frequency", class(tab)) tab } ## Useful for TermDocumentMatrix TermDocumentMatrix_classes <- c("TermDocumentMatrix", "simple_triplet_matrix") ## Useful for TermDocumentMatrix .TermDocumentMatrix <- function(x, weighting) { x <- as.simple_triplet_matrix(x) if(!is.null(dimnames(x))) names(dimnames(x)) <- c("Terms", "Docs") class(x) <- TermDocumentMatrix_classes ## <NOTE> ## Note that if weighting is a weight function, it already needs to ## know whether we have a term-document or document-term matrix. ## ## Ideally we would require weighting to be a WeightFunction object ## or a character string of length 2. But then ## dtm <- DocumentTermMatrix(crude, ## control = list(weighting = ## function(x) ## weightTfIdf(x, normalize = ## FALSE), ## stopwords = TRUE)) ## in example("DocumentTermMatrix") fails [because weightTfIdf() is ## a weight function and not a weight function generator ...] ## Hence, for now, instead of ## if(inherits(weighting, "WeightFunction")) ## x <- weighting(x) ## use if(is.function(weighting)) x <- weighting(x) ## and hope for the best ... ## </NOTE> else if(is.character(weighting) && (length(weighting) == 2L)) attr(x, "weighting") <- weighting else stop("invalid weighting") x } ## Modified command "TermDocumentMatrix" on package tm ## and defined "TermDocumentMatrixCN" TermDocumentMatrixCN<- function (x, control = list()) { stopifnot(is.list(control)) tflist <- lapply(unname(content(x)), termFreqCN, control) tflist <- lapply(tflist, function(y) y[y > 0]) v <- unlist(tflist) i <- names(v) allTerms <- sort(unique(as.character(if (is.null(control$dictionary)) i else control$dictionary))) i <- match(i, allTerms) j <- rep(seq_along(x), sapply(tflist, length)) docs <- as.character(meta(x, "id", "local")) if (length(docs) != length(x)) { warning("invalid document identifiers") docs <- NULL } m <- simple_triplet_matrix(i = i, j = j, v = as.numeric(v), nrow = length(allTerms), ncol = length(x), dimnames = list(Terms = allTerms, Docs = docs)) bg <- control$bounds$global if (length(bg) == 2L && is.numeric(bg)) { rs <- row_sums(m > 0) m <- m[(rs >= bg[1]) & (rs <= bg[2]), ] } weighting <- control$weighting if (is.null(weighting)) weighting <- weightTf .TermDocumentMatrix(m, weighting) } ######## 以上些著試試看把TermDocumentMatrix指令換成 "TermDocumentMatrixCN"看看希望有幫助到你^^ -- ※ 發信站: 批踢踢實業坊(ptt.cc), 來自: 119.14.92.116 ※ 文章網址: https://www.ptt.cc/bbs/R_Language/M.1440222991.A.069.html