Warning

This documents an unmaintained version of NetworkX. Please upgrade to a maintained version and see the current NetworkX documentation.

Source code for networkx.algorithms.link_analysis.pagerank_alg

"""PageRank analysis of graph structure. """
#    Copyright (C) 2004-2013 by
#    Aric Hagberg <hagberg@lanl.gov>
#    Dan Schult <dschult@colgate.edu>
#    Pieter Swart <swart@lanl.gov>
#    All rights reserved.
#    BSD license.
#    NetworkX:http://networkx.lanl.gov/
import networkx as nx
from networkx.exception import NetworkXError
from networkx.utils import not_implemented_for
__author__ = """\n""".join(["Aric Hagberg <aric.hagberg@gmail.com>",
                            "Brandon Liu <brandon.k.liu@gmail.com"])
__all__ = ['pagerank', 'pagerank_numpy', 'pagerank_scipy', 'google_matrix']


@not_implemented_for('multigraph')
[docs]def pagerank(G, alpha=0.85, personalization=None, max_iter=100, tol=1.0e-6, nstart=None, weight='weight', dangling=None): """Return the PageRank of the nodes in the graph. PageRank computes a ranking of the nodes in the graph G based on the structure of the incoming links. It was originally designed as an algorithm to rank web pages. Parameters ----------- G : graph A NetworkX graph. Undirected graphs will be converted to a directed graph with two directed edges for each undirected edge. alpha : float, optional Damping parameter for PageRank, default=0.85. personalization: dict, optional The "personalization vector" consisting of a dictionary with a key for every graph node and nonzero personalization value for each node. By default, a uniform distribution is used. max_iter : integer, optional Maximum number of iterations in power method eigenvalue solver. tol : float, optional Error tolerance used to check convergence in power method solver. nstart : dictionary, optional Starting value of PageRank iteration for each node. weight : key, optional Edge data key to use as weight. If None weights are set to 1. dangling: dict, optional The outedges to be assigned to any "dangling" nodes, i.e., nodes without any outedges. The dict key is the node the outedge points to and the dict value is the weight of that outedge. By default, dangling nodes are given outedges according to the personalization vector (uniform if not specified). This must be selected to result in an irreducible transition matrix (see notes under google_matrix). It may be common to have the dangling dict to be the same as the personalization dict. Returns ------- pagerank : dictionary Dictionary of nodes with PageRank as value Examples -------- >>> G = nx.DiGraph(nx.path_graph(4)) >>> pr = nx.pagerank(G, alpha=0.9) Notes ----- The eigenvector calculation is done by the power iteration method and has no guarantee of convergence. The iteration will stop after max_iter iterations or an error tolerance of number_of_nodes(G)*tol has been reached. The PageRank algorithm was designed for directed graphs but this algorithm does not check if the input graph is directed and will execute on undirected graphs by converting each edge in the directed graph to two edges. See Also -------- pagerank_numpy, pagerank_scipy, google_matrix References ---------- .. [1] A. Langville and C. Meyer, "A survey of eigenvector methods of web information retrieval." http://citeseer.ist.psu.edu/713792.html .. [2] Page, Lawrence; Brin, Sergey; Motwani, Rajeev and Winograd, Terry, The PageRank citation ranking: Bringing order to the Web. 1999 http://dbpubs.stanford.edu:8090/pub/showDoc.Fulltext?lang=en&doc=1999-66&format=pdf """ if len(G) == 0: return {} if not G.is_directed(): D = G.to_directed() else: D = G # Create a copy in (right) stochastic form W = nx.stochastic_graph(D, weight=weight) N = W.number_of_nodes() # Choose fixed starting vector if not given if nstart is None: x = dict.fromkeys(W, 1.0 / N) else: # Normalized nstart vector s = float(sum(nstart.values())) x = dict((k, v / s) for k, v in nstart.items()) if personalization is None: # Assign uniform personalization vector if not given p = dict.fromkeys(W, 1.0 / N) else: missing = set(G) - set(personalization) if missing: raise NetworkXError('Personalization dictionary ' 'must have a value for every node. ' 'Missing nodes %s' % missing) s = float(sum(personalization.values())) p = dict((k, v / s) for k, v in personalization.items()) if dangling is None: # Use personalization vector if dangling vector not specified dangling_weights = p else: missing = set(G) - set(dangling) if missing: raise NetworkXError('Dangling node dictionary ' 'must have a value for every node. ' 'Missing nodes %s' % missing) s = float(sum(dangling.values())) dangling_weights = dict((k, v/s) for k, v in dangling.items()) dangling_nodes = [n for n in W if W.out_degree(n, weight=weight) == 0.0] # power iteration: make up to max_iter iterations for _ in range(max_iter): xlast = x x = dict.fromkeys(xlast.keys(), 0) danglesum = alpha * sum(xlast[n] for n in dangling_nodes) for n in x: # this matrix multiply looks odd because it is # doing a left multiply x^T=xlast^T*W for nbr in W[n]: x[nbr] += alpha * xlast[n] * W[n][nbr][weight] x[n] += danglesum * dangling_weights[n] + (1.0 - alpha) * p[n] # check convergence, l1 norm err = sum([abs(x[n] - xlast[n]) for n in x]) if err < N*tol: return x raise NetworkXError('pagerank: power iteration failed to converge ' 'in %d iterations.' % max_iter)
[docs]def google_matrix(G, alpha=0.85, personalization=None, nodelist=None, weight='weight', dangling=None): """Return the Google matrix of the graph. Parameters ----------- G : graph A NetworkX graph. Undirected graphs will be converted to a directed graph with two directed edges for each undirected edge. alpha : float The damping factor. personalization: dict, optional The "personalization vector" consisting of a dictionary with a key for every graph node and nonzero personalization value for each node. By default, a uniform distribution is used. nodelist : list, optional The rows and columns are ordered according to the nodes in nodelist. If nodelist is None, then the ordering is produced by G.nodes(). weight : key, optional Edge data key to use as weight. If None weights are set to 1. dangling: dict, optional The outedges to be assigned to any "dangling" nodes, i.e., nodes without any outedges. The dict key is the node the outedge points to and the dict value is the weight of that outedge. By default, dangling nodes are given outedges according to the personalization vector (uniform if not specified) This must be selected to result in an irreducible transition matrix (see notes below). It may be common to have the dangling dict to be the same as the personalization dict. Returns ------- A : NumPy matrix Google matrix of the graph Notes ----- The matrix returned represents the transition matrix that describes the Markov chain used in PageRank. For PageRank to converge to a unique solution (i.e., a unique stationary distribution in a Markov chain), the transition matrix must be irreducible. In other words, it must be that there exists a path between every pair of nodes in the graph, or else there is the potential of "rank sinks." This implementation works with Multi(Di)Graphs. For multigraphs the weight between two nodes is set to be the sum of all edge weights between those nodes. See Also -------- pagerank, pagerank_numpy, pagerank_scipy """ import numpy as np if nodelist is None: nodelist = G.nodes() M = nx.to_numpy_matrix(G, nodelist=nodelist, weight=weight) N = len(G) if N == 0: return M # Personalization vector if personalization is None: p = np.repeat(1.0 / N, N) else: missing = set(nodelist) - set(personalization) if missing: raise NetworkXError('Personalization vector dictionary ' 'must have a value for every node. ' 'Missing nodes %s' % missing) p = np.array([personalization[n] for n in nodelist], dtype=float) p /= p.sum() # Dangling nodes if dangling is None: dangling_weights = p else: missing = set(nodelist) - set(dangling) if missing: raise NetworkXError('Dangling node dictionary ' 'must have a value for every node. ' 'Missing nodes %s' % missing) # Convert the dangling dictionary into an array in nodelist order dangling_weights = np.array([dangling[n] for n in nodelist], dtype=float) dangling_weights /= dangling_weights.sum() dangling_nodes = np.where(M.sum(axis=1) == 0)[0] # Assign dangling_weights to any dangling nodes (nodes with no out links) for node in dangling_nodes: M[node] = dangling_weights M /= M.sum(axis=1) # Normalize rows to sum to 1 return alpha * M + (1 - alpha) * np.outer(np.ones(N), p)
[docs]def pagerank_numpy(G, alpha=0.85, personalization=None, weight='weight', dangling=None): """Return the PageRank of the nodes in the graph. PageRank computes a ranking of the nodes in the graph G based on the structure of the incoming links. It was originally designed as an algorithm to rank web pages. Parameters ----------- G : graph A NetworkX graph. Undirected graphs will be converted to a directed graph with two directed edges for each undirected edge. alpha : float, optional Damping parameter for PageRank, default=0.85. personalization: dict, optional The "personalization vector" consisting of a dictionary with a key for every graph node and nonzero personalization value for each node. By default, a uniform distribution is used. weight : key, optional Edge data key to use as weight. If None weights are set to 1. dangling: dict, optional The outedges to be assigned to any "dangling" nodes, i.e., nodes without any outedges. The dict key is the node the outedge points to and the dict value is the weight of that outedge. By default, dangling nodes are given outedges according to the personalization vector (uniform if not specified) This must be selected to result in an irreducible transition matrix (see notes under google_matrix). It may be common to have the dangling dict to be the same as the personalization dict. Returns ------- pagerank : dictionary Dictionary of nodes with PageRank as value. Examples -------- >>> G = nx.DiGraph(nx.path_graph(4)) >>> pr = nx.pagerank_numpy(G, alpha=0.9) Notes ----- The eigenvector calculation uses NumPy's interface to the LAPACK eigenvalue solvers. This will be the fastest and most accurate for small graphs. This implementation works with Multi(Di)Graphs. For multigraphs the weight between two nodes is set to be the sum of all edge weights between those nodes. See Also -------- pagerank, pagerank_scipy, google_matrix References ---------- .. [1] A. Langville and C. Meyer, "A survey of eigenvector methods of web information retrieval." http://citeseer.ist.psu.edu/713792.html .. [2] Page, Lawrence; Brin, Sergey; Motwani, Rajeev and Winograd, Terry, The PageRank citation ranking: Bringing order to the Web. 1999 http://dbpubs.stanford.edu:8090/pub/showDoc.Fulltext?lang=en&doc=1999-66&format=pdf """ import numpy as np if len(G) == 0: return {} M = google_matrix(G, alpha, personalization=personalization, weight=weight, dangling=dangling) # use numpy LAPACK solver eigenvalues, eigenvectors = np.linalg.eig(M.T) ind = eigenvalues.argsort() # eigenvector of largest eigenvalue at ind[-1], normalized largest = np.array(eigenvectors[:, ind[-1]]).flatten().real norm = float(largest.sum()) return dict(zip(G, map(float, largest / norm)))
[docs]def pagerank_scipy(G, alpha=0.85, personalization=None, max_iter=100, tol=1.0e-6, weight='weight', dangling=None): """Return the PageRank of the nodes in the graph. PageRank computes a ranking of the nodes in the graph G based on the structure of the incoming links. It was originally designed as an algorithm to rank web pages. Parameters ----------- G : graph A NetworkX graph. Undirected graphs will be converted to a directed graph with two directed edges for each undirected edge. alpha : float, optional Damping parameter for PageRank, default=0.85. personalization: dict, optional The "personalization vector" consisting of a dictionary with a key for every graph node and nonzero personalization value for each node. By default, a uniform distribution is used. max_iter : integer, optional Maximum number of iterations in power method eigenvalue solver. tol : float, optional Error tolerance used to check convergence in power method solver. weight : key, optional Edge data key to use as weight. If None weights are set to 1. dangling: dict, optional The outedges to be assigned to any "dangling" nodes, i.e., nodes without any outedges. The dict key is the node the outedge points to and the dict value is the weight of that outedge. By default, dangling nodes are given outedges according to the personalization vector (uniform if not specified) This must be selected to result in an irreducible transition matrix (see notes under google_matrix). It may be common to have the dangling dict to be the same as the personalization dict. Returns ------- pagerank : dictionary Dictionary of nodes with PageRank as value Examples -------- >>> G = nx.DiGraph(nx.path_graph(4)) >>> pr = nx.pagerank_scipy(G, alpha=0.9) Notes ----- The eigenvector calculation uses power iteration with a SciPy sparse matrix representation. This implementation works with Multi(Di)Graphs. For multigraphs the weight between two nodes is set to be the sum of all edge weights between those nodes. See Also -------- pagerank, pagerank_numpy, google_matrix References ---------- .. [1] A. Langville and C. Meyer, "A survey of eigenvector methods of web information retrieval." http://citeseer.ist.psu.edu/713792.html .. [2] Page, Lawrence; Brin, Sergey; Motwani, Rajeev and Winograd, Terry, The PageRank citation ranking: Bringing order to the Web. 1999 http://dbpubs.stanford.edu:8090/pub/showDoc.Fulltext?lang=en&doc=1999-66&format=pdf """ import scipy.sparse N = len(G) if N == 0: return {} nodelist = G.nodes() M = nx.to_scipy_sparse_matrix(G, nodelist=nodelist, weight=weight, dtype=float) S = scipy.array(M.sum(axis=1)).flatten() S[S != 0] = 1.0 / S[S != 0] Q = scipy.sparse.spdiags(S.T, 0, *M.shape, format='csr') M = Q * M # initial vector x = scipy.repeat(1.0 / N, N) # Personalization vector if personalization is None: p = scipy.repeat(1.0 / N, N) else: missing = set(nodelist) - set(personalization) if missing: raise NetworkXError('Personalization vector dictionary ' 'must have a value for every node. ' 'Missing nodes %s' % missing) p = scipy.array([personalization[n] for n in nodelist], dtype=float) p = p / p.sum() # Dangling nodes if dangling is None: dangling_weights = p else: missing = set(nodelist) - set(dangling) if missing: raise NetworkXError('Dangling node dictionary ' 'must have a value for every node. ' 'Missing nodes %s' % missing) # Convert the dangling dictionary into an array in nodelist order dangling_weights = scipy.array([dangling[n] for n in nodelist], dtype=float) dangling_weights /= dangling_weights.sum() is_dangling = scipy.where(S == 0)[0] # power iteration: make up to max_iter iterations for _ in range(max_iter): xlast = x x = alpha * (x * M + sum(x[is_dangling]) * dangling_weights) + \ (1 - alpha) * p # check convergence, l1 norm err = scipy.absolute(x - xlast).sum() if err < N * tol: return dict(zip(nodelist, map(float, x))) raise NetworkXError('pagerank_scipy: power iteration failed to converge ' 'in %d iterations.' % max_iter) # fixture for nose tests
def setup_module(module): from nose import SkipTest try: import numpy except: raise SkipTest("NumPy not available") try: import scipy except: raise SkipTest("SciPy not available")