############################################################################### # # library # library(VGAM) # zeta function library(R.matlab) # read matlab file just for test purpose # ############################################################################### # # test zone # ############################################################################### runtest <- function(){ #read a matlab file #x generated by x = randht(10000,'xmin',15,'powerlaw',2.5) with matlab (continuous law) x<-readMat(file("x_2.5_15_10000.mat","rb"))\$x[,1] #plfit matlab:alpha=2.4975,xmin=18.5752,D=0.0062 plfit(x) #plfit R:alpha=2.497485,xmin=18.57524,D=0.006174977 #x0 generated by x0 =floor(x) with matlab (big discrete approximation) x0<-readMat(file("x0_2.5_15_10000.mat","rb"))\$x[,1] #plfit matlab:alpha=2.4700,xmin=15,D=0.0056 plfit(x0) #plfit r:alpha=2.47,xmin=15,D=0.00564463 } ############################################################################### # # PLFIT fits a power-law distributional model to data. # # PLFIT(x) estimates x_min and alpha according to the goodness-of-fit # based method described in Clauset, Shalizi, Newman (2007). x is a # vector of observations of some quantity to which we wish to fit the # power-law distribution p(x) ~ x^-alpha for x >= xmin. # PLFIT automatically detects whether x is composed of real or integer # values, and applies the appropriate method. For discrete data, if # min(x) > 1000, PLFIT uses the continuous approximation, which is # a reliable in this regime. # # The fitting procedure works as follows: # 1) For each possible choice of x_min, we estimate alpha via the # method of maximum likelihood, and calculate the Kolmogorov-Smirnov # goodness-of-fit statistic D. # 2) We then select as our estimate of x_min, the value that gives the # minimum value D over all values of x_min. # # Note that this procedure gives no estimate of the uncertainty of the # fitted parameters, nor of the validity of the fit. # # Example: # x <- (1-runif(10000))^(-1/(2.5-1)) # plfit(x) # # # Version 1.0 (2008 February) # Version 1.1 (2008 February) # - correction : division by zero if limit >= max(x) because the unique R function do no sort # and the matlab function do... # Version 1.1 (minor correction 2009 August) # - correction : lines 230 zdiff calcul was wrong when xmin=0 (thanks to Naoki Masuda) # - gpl version updated to v3.0 (asked by Felipe Ortega) # Version 1.2 (2011 August) # - correction for method "limit" thanks to David R. Pugh # xmins <- xmins[xmins<=limit] is now xmins <- xmins[xmins>=limit] # - "fixed" method added for xmins from David R. Pugh # - modifications by Alan Di Vittorio: # - correction : zdiff calculation was wrong when xmin==1 # - the previous zdiff correction was incorrect # - correction : x has to have at least two unique values # - additional discrete x input test : discrete x cannot contain the value 0 # - added option to truncate continuous xmin search when # of obs gets small # # Copyright (C) 2008,2011 Laurent Dubroca laurent.dubroca_at_gmail.com # (Stazione Zoologica Anton Dohrn, Napoli, Italy) # Distributed under GPL 3.0 # http://www.gnu.org/copyleft/gpl.html # PLFIT comes with ABSOLUTELY NO WARRANTY # Matlab to R translation based on the original code of Aaron Clauset (Santa Fe Institute) # Source: http://www.santafe.edu/~aaronc/powerlaws/ # # Notes: # # 1. In order to implement the integer-based methods in Matlab, the numeric # maximization of the log-likelihood function was used. This requires # that we specify the range of scaling parameters considered. We set # this range to be seq(1.5,3.5,0.01) by default. This vector can be # set by the user like so, # # a <- plfit(x,"range",seq(1.001,5,0.001)) # # 2. PLFIT can be told to limit the range of values considered as estimates # for xmin in two ways. First, it can be instructed to sample these # possible values like so, # # a <- plfit(x,"sample",100) # # which uses 100 uniformly distributed values on the sorted list of # unique values in the data set. Alternatively, it can simply omit all # candidates below a hard limit, like so # # a <- plfit(x,"limit",3.4) # # In the case of discrete data, it rounds the limit to the nearest # integer. # # Finally, if you wish to force the threshold parameter to take a specific value # (useful for bootstrapping), simply call plfit() like so # # a <- plfit(x,"fixed",3.5) # # 3. When the input sample size is small (e.g., < 100), the estimator is # known to be slightly biased (toward larger values of alpha). To # explicitly use an experimental finite-size correction, call PLFIT like # so # # a <- plfit(x,finite=TRUE) # # 4. For continuous data, PLFIT can return erroneously large estimates of # alpha when xmin is so large that the number of obs x >= xmin is very # small. To prevent this, we can truncate the search over xmin values # before the finite-size bias becomes significant by calling PLFIT as # # a = plfit(x,nosmall=TRUE); # # which skips values xmin with finite size bias > 0.1. # ############################################################################### plfit<-function(x=rpareto(1000,10,2.5),method="limit",value=c(),finite=FALSE,nowarn=FALSE,nosmall=FALSE){ #init method value to NULL vec <- c() ; sampl <- c() ; limit <- c(); fixed <- c() ########################################################################################### # # test and trap for bad input # switch(method, range = vec <- value, sample = sampl <- value, limit = limit <- value, fixed = fixed <- value, argok <- 0) if(exists("argok")){stop("(plfit) Unrecognized method")} if( !is.null(vec) && (!is.vector(vec) || min(vec)<=1 || length(vec)<=1) ){ print(paste("(plfit) Error: ''range'' argument must contain a vector > 1; using default.")) vec <- c() } if( !is.null(sampl) && ( !(sampl==floor(sampl)) || length(sampl)>1 || sampl<2 ) ){ print(paste("(plfit) Error: ''sample'' argument must be a positive integer > 2; using default.")) sample <- c() } if( !is.null(limit) && (length(limit)>1 || limit<1) ){ print(paste("(plfit) Error: ''limit'' argument must be a positive >=1; using default.")) limit <- c() } if( !is.null(fixed) && (length(fixed)>1 || fixed<=0) ){ print(paste("(plfit) Error: ''fixed'' argument must be a positive >0; using default.")) fixed <- c() } # select method (discrete or continuous) for fitting and test if x is a vector fdattype<-"unknow" if( is.vector(x,"numeric") ){ fdattype<-"real" } if( all(x==floor(x)) && is.vector(x) ){ fdattype<-"integer" } if( all(x==floor(x)) && min(x) > 1000 && length(x) > 100 ){ fdattype <- "real" } if( fdattype=="unknow" ){ stop("(plfit) Error: x must contain only reals or only integers.") } # # end test and trap for bad input # ########################################################################################### ########################################################################################### # # estimate xmin and alpha in the continuous case # if( fdattype=="real" ){ xmins <- sort(unique(x)) xmins <- xmins[-length(xmins)] if( !is.null(limit) ){ xmins <- xmins[xmins>=limit] } if( !is.null(fixed) ){ xmins <- fixed } if( !is.null(sampl) ){ xmins <- xmins[unique(round(seq(1,length(xmins),length.out=sampl)))] } dat <- rep(0,length(xmins)) z <- sort(x) for( xm in 1:length(xmins) ){ xmin <- xmins[xm] z <- z[z>=xmin] n <- length(z) # estimate alpha using direct MLE a <- n/sum(log(z/xmin)) # truncate search if nosmall is selected if( nosmall ){ if((a-1)/sqrt(n) > 0.1){ dat <- dat[1:(xm-1)] print(paste("(plfit) Warning : xmin search truncated beyond",xmins[xm-1])) break } } # compute KS statistic cx <- c(0:(n-1))/n cf <- 1-(xmin/z)^a dat[xm] <- max(abs(cf-cx)) } D <- min(dat) xmin <- xmins[min(which(dat<=D))] z <- x[x>=xmin] n <- length(z) alpha <- 1 + n/sum(log(z/xmin)) if( finite ){ alpha <- alpha*(n-1)/n+1/n # finite-size correction } if( n<50 && !finite && !nowarn){ print("(plfit) Warning : finite-size bias may be present") } } # # end continuous case # ########################################################################################### ########################################################################################### # # estimate xmin and alpha in the discrete case # if( fdattype=="integer" ){ if( is.null(vec) ){ vec<-seq(1.5,3.5,.01) } # covers range of most practical scaling parameters zvec <- zeta(vec) xmins <- sort(unique(x)) xmins <- xmins[-length(xmins)] if( !is.null(limit) ){ limit <- round(limit) xmins <- xmins[xmins>=limit] } if( !is.null(fixed) ){ xmins <- fixed } if( !is.null(sampl) ){ xmins <- xmins[unique(round(seq(1,length(xmins),length.out=sampl)))] } if( is.null(xmins) || length(xmins) < 2){ stop("(plfit) error: x must contain at least two unique values.") } if(length(which(xmins==0) > 0)){ stop("(plfit) error: x must not contain the value 0.") } xmax <- max(x) dat <- matrix(0,nrow=length(xmins),ncol=2) z <- x for( xm in 1:length(xmins) ){ xmin <- xmins[xm] z <- z[z>=xmin] n <- length(z) # estimate alpha via direct maximization of likelihood function # vectorized version of numerical calculation # matlab: zdiff = sum( repmat((1:xmin-1)',1,length(vec)).^-repmat(vec,xmin-1,1) ,1); if(xmin==1){ zdiff <- rep(0,length(vec)) }else{ zdiff <- apply(rep(t(1:(xmin-1)),length(vec))^-t(kronecker(t(array(1,xmin-1)),vec)),2,sum) } # matlab: L = -vec.*sum(log(z)) - n.*log(zvec - zdiff); L <- -vec*sum(log(z)) - n*log(zvec - zdiff); I <- which.max(L) # compute KS statistic fit <- cumsum((((xmin:xmax)^-vec[I])) / (zvec[I] - sum((1:(xmin-1))^-vec[I]))) cdi <- cumsum(hist(z,c(min(z)-1,(xmin+.5):xmax,max(z)+1),plot=FALSE)\$counts/n) dat[xm,] <- c(max(abs( fit - cdi )),vec[I]) } D <- min(dat[,1]) I <- which.min(dat[,1]) xmin <- xmins[I] n <- sum(x>=xmin) alpha <- dat[I,2] if( finite ){ alpha <- alpha*(n-1)/n+1/n # finite-size correction } if( n<50 && !finite && !nowarn){ print("(plfit) Warning : finite-size bias may be present") } } # # end discrete case # ########################################################################################### # return xmin, alpha and D in a list return(list(xmin=xmin,alpha=alpha,D=D)) }