function [alpha, xmin, L]=plfit(x, varargin) % PLFIT fits a power-law distributional model to data. % Source: http://www.santafe.edu/~aaronc/powerlaws/ % % PLFIT(x) estimates x_min and alpha according to the goodness-of-fit % based method described in Clauset, Shalizi, Newman (2007). x is a % vector of observations of some quantity to which we wish to fit the % power-law distribution p(x) ~ x^-alpha for x >= xmin. % PLFIT automatically detects whether x is composed of real or integer % values, and applies the appropriate method. For discrete data, if % min(x) > 1000, PLFIT uses the continuous approximation, which is % a reliable in this regime. % % The fitting procedure works as follows: % 1) For each possible choice of x_min, we estimate alpha via the % method of maximum likelihood, and calculate the Kolmogorov-Smirnov % goodness-of-fit statistic D. % 2) We then select as our estimate of x_min, the value that gives the % minimum value D over all values of x_min. % % Note that this procedure gives no estimate of the uncertainty of the % fitted parameters, nor of the validity of the fit. % % Example: % x = (1-rand(10000,1)).^(-1/(2.5-1)); % [alpha, xmin, L] = plfit(x); % % The output 'alpha' is the maximum likelihood estimate of the scaling % exponent, 'xmin' is the estimate of the lower bound of the power-law % behavior, and L is the log-likelihood of the data x>=xmin under the % fitted power law. % % For more information, try 'type plfit' % % See also PLVAR, PLPVA % Version 1.0 (2007 May) % Version 1.0.2 (2007 September) % Version 1.0.3 (2007 September) % Version 1.0.4 (2008 January) % Version 1.0.5 (2008 March) % Version 1.0.6 (2008 July) % Version 1.0.7 (2008 October) % Version 1.0.8 (2009 February) % Version 1.0.9 (2009 October) % Version 1.0.10 (2010 January) % Version 1.0.11 (2012 January) % Copyright (C) 2008-2012 Aaron Clauset (Santa Fe Institute) % Distributed under GPL 2.0 % http://www.gnu.org/copyleft/gpl.html % PLFIT comes with ABSOLUTELY NO WARRANTY % % Notes: % % 1. In order to implement the integer-based methods in Matlab, the numeric % maximization of the log-likelihood function was used. This requires % that we specify the range of scaling parameters considered. We set % this range to be [1.50 : 0.01 : 3.50] by default. This vector can be % set by the user like so, % % a = plfit(x,'range',[1.001:0.001:5.001]); % % 2. PLFIT can be told to limit the range of values considered as estimates % for xmin in three ways. First, it can be instructed to sample these % possible values like so, % % a = plfit(x,'sample',100); % % which uses 100 uniformly distributed values on the sorted list of % unique values in the data set. Second, it can simply omit all % candidates above a hard limit, like so % % a = plfit(x,'limit',3.4); % % Finally, it can be forced to use a fixed value, like so % % a = plfit(x,'xmin',3.4); % % In the case of discrete data, it rounds the limit to the nearest % integer. % % 3. When the input sample size is small (e.g., < 100), the continuous % estimator is slightly biased (toward larger values of alpha). To % explicitly use an experimental finite-size correction, call PLFIT like % so % % a = plfit(x,'finite'); % % which does a small-size correction to alpha. % % 4. For continuous data, PLFIT can return erroneously large estimates of % alpha when xmin is so large that the number of obs x >= xmin is very % small. To prevent this, we can truncate the search over xmin values % before the finite-size bias becomes significant by calling PLFIT as % % a = plfit(x,'nosmall'); % % which skips values xmin with finite size bias > 0.1. vec = []; sample = []; xminx = []; limit = []; finite = false; nosmall = false; nowarn = false; % parse command-line parameters; trap for bad input i=1; while i<=length(varargin), argok = 1; if ischar(varargin{i}), switch varargin{i}, case 'range', vec = varargin{i+1}; i = i + 1; case 'sample', sample = varargin{i+1}; i = i + 1; case 'limit', limit = varargin{i+1}; i = i + 1; case 'xmin', xminx = varargin{i+1}; i = i + 1; case 'finite', finite = true; case 'nowarn', nowarn = true; case 'nosmall', nosmall = true; otherwise, argok=0; end end if ~argok, disp(['(PLFIT) Ignoring invalid argument #' num2str(i+1)]); end i = i+1; end if ~isempty(vec) && (~isvector(vec) || min(vec)<=1), fprintf('(PLFIT) Error: ''range'' argument must contain a vector; using default.\n'); vec = []; end; if ~isempty(sample) && (~isscalar(sample) || sample<2), fprintf('(PLFIT) Error: ''sample'' argument must be a positive integer > 1; using default.\n'); sample = []; end; if ~isempty(limit) && (~isscalar(limit) || limit= 1; using default.\n'); limit = []; end; if ~isempty(xminx) && (~isscalar(xminx) || xminx>=max(x)), fprintf('(PLFIT) Error: ''xmin'' argument must be a positive value < max(x); using default behavior.\n'); xminx = []; end; % reshape input vector x = reshape(x,numel(x),1); % select method (discrete or continuous) for fitting if isempty(setdiff(x,floor(x))), f_dattype = 'INTS'; elseif isreal(x), f_dattype = 'REAL'; else f_dattype = 'UNKN'; end; if strcmp(f_dattype,'INTS') && min(x) > 1000 && length(x)>100, f_dattype = 'REAL'; end; % estimate xmin and alpha, accordingly switch f_dattype, case 'REAL', xmins = unique(x); xmins = xmins(1:end-1); if ~isempty(xminx), xmins = xmins(find(xmins>=xminx,1,'first')); end; if ~isempty(limit), xmins(xmins>limit) = []; end; if ~isempty(sample), xmins = xmins(unique(round(linspace(1,length(xmins),sample)))); end; dat = zeros(size(xmins)); z = sort(x); for xm=1:length(xmins) xmin = xmins(xm); z = z(z>=xmin); n = length(z); % estimate alpha using direct MLE a = n ./ sum( log(z./xmin) ); if nosmall, if (a-1)/sqrt(n) > 0.1 dat(xm:end) = []; xm = length(xmins)+1; break; end; end; % compute KS statistic cx = (0:n-1)'./n; cf = 1-(xmin./z).^a; dat(xm) = max( abs(cf-cx) ); end; D = min(dat); xmin = xmins(find(dat<=D,1,'first')); z = x(x>=xmin); n = length(z); alpha = 1 + n ./ sum( log(z./xmin) ); if finite, alpha = alpha*(n-1)/n+1/n; end; % finite-size correction if n < 50 && ~finite && ~nowarn, fprintf('(PLFIT) Warning: finite-size bias may be present.\n'); end; L = n*log((alpha-1)/xmin) - alpha.*sum(log(z./xmin)); case 'INTS', if isempty(vec), vec = (1.50:0.01:3.50); % covers range of most practical end; % scaling parameters zvec = zeta(vec); xmins = unique(x); xmins = xmins(1:end-1); if ~isempty(xminx), xmins = xmins(find(xmins>=xminx,1,'first')); end; if ~isempty(limit), limit = round(limit); xmins(xmins>limit) = []; end; if ~isempty(sample), xmins = xmins(unique(round(linspace(1,length(xmins),sample)))); end; if isempty(xmins) fprintf('(PLFIT) Error: x must contain at least two unique values.\n'); alpha = NaN; xmin = x(1); D = NaN; return; end; xmax = max(x); dat = zeros(length(xmins),2); z = x; fcatch = 0; for xm=1:length(xmins) xmin = xmins(xm); z = z(z>=xmin); n = length(z); % estimate alpha via direct maximization of likelihood function if fcatch==0 try % vectorized version of numerical calculation zdiff = sum( repmat((1:xmin-1)',1,length(vec)).^-repmat(vec,xmin-1,1) ,1); L = -vec.*sum(log(z)) - n.*log(zvec - zdiff); catch % catch: force loop to default to iterative version for % remainder of the search fcatch = 1; end; end; if fcatch==1 % force iterative calculation (more memory efficient, but % can be slower) L = -Inf*ones(size(vec)); slogz = sum(log(z)); xminvec = (1:xmin-1); for k=1:length(vec) L(k) = -vec(k)*slogz - n*log(zvec(k) - sum(xminvec.^-vec(k))); end end; [Y,I] = max(L); % compute KS statistic fit = cumsum((((xmin:xmax).^-vec(I)))./ (zvec(I) - sum((1:xmin-1).^-vec(I)))); cdi = cumsum(hist(z,xmin:xmax)./n); dat(xm,:) = [max(abs( fit - cdi )) vec(I)]; end % select the index for the minimum value of D [D,I] = min(dat(:,1)); xmin = xmins(I); z = x(x>=xmin); n = length(z); alpha = dat(I,2); if finite, alpha = alpha*(n-1)/n+1/n; end; % finite-size correction if n < 50 && ~finite && ~nowarn, fprintf('(PLFIT) Warning: finite-size bias may be present.\n'); end; L = -alpha*sum(log(z)) - n*log(zvec(find(vec<=alpha,1,'last')) - sum((1:xmin-1).^-alpha)); otherwise, fprintf('(PLFIT) Error: x must contain only reals or only integers.\n'); alpha = []; xmin = []; L = []; return; end;