##language:zh #pragma section-numbers on ''' '''PyPageRankGuess''' -- 猜想实现 Google Page Rank ''' ::-- ZoomQuiet [<>] <> ## 默许导航,请保留 <> = 算法實作 = * '''Tsung's Blog''' -- [[http://plog.longwin.com.tw/post/1/484|Google PageRank 演算法實作(Python版)]] == Algorithm in 126 Lines == {{{#!python # PageRank algorithm # By Peter Bengtsson # http://www.peterbe.com/ # mail@peterbe.com # # Requires the numarray module # http://www.stsci.edu/resources/software_hardware/numarray from numarray import * import numarray.linear_algebra as la def _sum_sequence(seq): """ sums up a sequence """ def _add(x,y): return x+y return reduce(_add, seq, 0) class PageRanker: def __init__(self, p, webmatrix): assert p>=0 and p <= 1 self.p = float(p) if type(webmatrix)in [type([]), type(())]: webmatrix = array(webmatrix, Float) assert webmatrix.shape[0]==webmatrix.shape[1] self.webmatrix = webmatrix # create the deltamatrix imatrix = identity(webmatrix.shape[0], Float) for i in range(webmatrix.shape[0]): imatrix[i] = imatrix[i]*sum(webmatrix[i,:]) deltamatrix = la.inverse(imatrix) self.deltamatrix = deltamatrix # create the fmatrix self.fmatrix = ones(webmatrix.shape, Float) self.sigma = webmatrix.shape[0] # calculate the Stochastic matrix _f_normalized = (self.sigma**-1)*self.fmatrix _randmatrix = (1-p)*_f_normalized _linkedmatrix = p * matrixmultiply(deltamatrix, webmatrix) M = _randmatrix + _linkedmatrix self.stochasticmatrix = M self.invariantmeasure = ones((1, webmatrix.shape[0]), Float) def improve_guess(self, times=1): for i in range(times): self._improve() def _improve(self): self.invariantmeasure = matrixmultiply(self.invariantmeasure, self.stochasticmatrix) def get_invariant_measure(self): return self.invariantmeasure def getPageRank(self): sum = _sum_sequence(self.invariantmeasure[0]) copy = self.invariantmeasure[0] for i in range(len(copy)): copy[i] = copy[i]/sum return copy if __name__=='__main__': # Example usage web = ((0, 1, 0, 0), (0, 0, 1, 0), (0, 0, 0, 1), (1, 0, 0, 0)) pr = PageRanker(0.85, web) pr.improve_guess(100) print pr.getPageRank() }}} == PageRank-in-Python == {{{#!python #!/usr/bin/env python from numpy import * def pageRankGenerator( At = [array((), int32)], numLinks = array((), int32), ln = array((), int32), alpha = 0.85, convergence = 0.01, checkSteps = 10 ): """ Compute an approximate page rank vector of N pages to within some convergence factor. @param At a sparse square matrix with N rows. At[ii] contains the indices of pages jj linking to ii. @param numLinks iNumLinks[ii] is the number of links going out from ii. @param ln contains the indices of pages without links @param alpha a value between 0 and 1. Determines the relative importance of "stochastic" links. @param convergence a relative convergence criterion. smaller means better, but more expensive. @param checkSteps check for convergence after so many steps """ # the number of "pages" N = len(At) # the number of "pages without links" M = ln.shape[0] # initialize: single-precision should be good enough iNew = ones((N,), float32) / N iOld = ones((N,), float32) / N done = False while not done: # normalize every now and then for numerical stability iNew /= sum(iNew) for step in range(checkSteps): # swap arrays iOld, iNew = iNew, iOld # an element in the 1 x I vector. # all elements are identical. oneIv = (1 - alpha) * sum(iOld) / N # an element of the A x I vector. # all elements are identical. oneAv = 0.0 if M > 0: oneAv = alpha * sum(iOld.take(ln, axis = 0)) * M / N # the elements of the H x I multiplication ii = 0 while ii < N: page = At[ii] h = 0 if page.shape[0]: h = alpha * dot( iOld.take(page, axis = 0), 1. / numLinks.take(page, axis = 0) ) iNew[ii] = h + oneAv + oneIv ii += 1 diff = iNew - iOld done = (sqrt(dot(diff, diff)) / N < convergence) yield iNew def transposeLinkMatrix( outGoingLinks = [[]] ): """ Transpose the link matrix. The link matrix contains the pages each page points to. But what we want is to know which pages point to a given page, while retaining information about how many links each page contains (so store that in a separate array), as well as which pages contain no links at all (leaf nodes). @param outGoingLinks outGoingLinks[ii] contains the indices of pages pointed to by page ii @return a tuple of (incomingLinks, numOutGoingLinks, leafNodes) """ nPages = len(outGoingLinks) # incomingLinks[ii] will contain the indices jj of the pages linking to page ii incomingLinks = [[] for ii in range(nPages)] # the number of links in each page numLinks = zeros(nPages, int32) # the indices of the leaf nodes leafNodes = [] for ii in range(nPages): if len(outGoingLinks[ii]) == 0: leafNodes.append(ii) else: numLinks[ii] = len(outGoingLinks[ii]) # transpose the link matrix for jj in outGoingLinks[ii]: incomingLinks[jj].append(ii) incomingLinks = [array(ii) for ii in incomingLinks] numLinks = array(numLinks) leafNodes = array(leafNodes) return incomingLinks, numLinks, leafNodes def pageRank( linkMatrix = [[]], alpha = 0.85, convergence = 0.01, checkSteps = 10 ): """ Convenience wrap for the link matrix transpose and the generator. @see pageRankGenerator for parameter description """ incomingLinks, numLinks, leafNodes = transposeLinkMatrix(linkMatrix) for gr in pageRankGenerator(incomingLinks, numLinks, leafNodes, alpha = alpha, convergence = convergence, checkSteps = checkSteps): final = gr return final }}} = 反馈 =