Commit ed28f227 authored by gsavin's avatar gsavin

Add Wikipedia generator

parent 8a625ea6
......@@ -72,6 +72,7 @@ public class URLGenerator extends BaseGenerator {
protected LinkedList<URLFilter> filters;
protected double step;
protected boolean printProgress;
protected int depthLimit;
protected final ReentrantLock lock;
public URLGenerator(String... startFrom) {
......@@ -84,10 +85,11 @@ public class URLGenerator extends BaseGenerator {
directed = false;
step = 0;
printProgress = false;
depthLimit = 0;
lock = new ReentrantLock();
declineMatchingURL("^(javascript:|mailto:|#).*");
declineMatchingURL(".*[.](avi|tar|gz|zip|mp3|mpg|jpg|jpeg|png|ogg|flv)$");
declineMatchingURL(".*[.](avi|tar|gz|zip|mp3|mpg|jpg|jpeg|png|ogg|flv|ico|svg)$");
setUseInternalGraph(true);
......@@ -112,10 +114,10 @@ public class URLGenerator extends BaseGenerator {
* @see org.graphstream.algorithm.generator.Generator#nextEvents()
*/
public boolean nextEvents() {
sendStepBegins(sourceId, step++);
sendStepBegins(sourceId, step);
sendGraphAttributeChanged(sourceId, "urls.parsed", null, urls.size());
sendGraphAttributeChanged(sourceId, "urls.remaining", null, stepUrls
.size());
sendGraphAttributeChanged(sourceId, "urls.remaining", null,
stepUrls.size());
if (printProgress)
progress();
......@@ -138,15 +140,16 @@ public class URLGenerator extends BaseGenerator {
try {
parseUrl(url);
} catch (IOException e) {
System.err.printf("Failed to parse \"%s\" : %s\n", url, e
.getMessage());
System.err.printf("Failed to parse \"%s\" : %s\n", url,
e.getMessage());
}
}
}
stepUrls.clear();
stepUrls.addAll(newUrls);
step++;
return newUrls.size() > 0;
}
......@@ -218,6 +221,15 @@ public class URLGenerator extends BaseGenerator {
this.threads = count;
}
/**
 * Set the maximum number of steps during which newly discovered links are
 * still followed. If 0 or less, the limit is disabled.
 * <p>
 * The depth checks performed while parsing only treat the value 0 as
 * "no limit" ({@code depthLimit == 0 || step < depthLimit}), so a negative
 * argument is normalized to 0 here; otherwise a negative limit would block
 * every link instead of disabling the limit.
 *
 * @param depthLimit
 *            maximum number of steps, or a value less than or equal to 0
 *            to disable the limit
 */
public void setDepthLimit(int depthLimit) {
    this.depthLimit = Math.max(0, depthLimit);
}
/**
 * Turn progress reporting on or off. While enabled,
 * {@link #nextEvents()} calls {@code progress()} on each step.
 *
 * @param on
 *            true to enable progress reporting, false to disable it
 */
public void enableProgression(boolean on) {
    this.printProgress = on;
}
......@@ -386,8 +398,8 @@ public class URLGenerator extends BaseGenerator {
href = href.trim();
if (href.charAt(0) == '/')
href = String.format("%s://%s%s", uri.getScheme(), uri
.getHost(), href);
href = String.format("%s://%s%s", uri.getScheme(),
uri.getHost(), href);
if (href.charAt(0) == '.')
href = String.format("%s%s", url, href);
......@@ -396,13 +408,19 @@ public class URLGenerator extends BaseGenerator {
continue;
try {
synchronizedOperation(href, null);
synchronizedOperation(url, href);
if (depthLimit == 0 || step < depthLimit) {
synchronizedOperation(href, null);
synchronizedOperation(url, href);
} else {
if (urls.contains(href))
synchronizedOperation(url, href);
}
} catch (URISyntaxException e) {
throw new IOException(e);
}
if (!urls.contains(href))
if (!urls.contains(href)
&& (depthLimit == 0 || step < depthLimit))
localUrls.add(href);
}
}
......@@ -414,7 +432,7 @@ public class URLGenerator extends BaseGenerator {
} finally {
lock.unlock();
}
localUrls.clear();
localUrls = null;
......@@ -445,13 +463,18 @@ public class URLGenerator extends BaseGenerator {
break;
case FULL:
nodeId = String.format("%s://%s%s%s", uri.getScheme(), uri
.getHost(), uri.getPath(), uri.getQuery());
.getHost(), uri.getPath(), uri.getQuery() == null ? ""
: uri.getQuery());
break;
}
return nodeId;
}
/**
 * Get the text used as the "label" attribute of the node created for an
 * url. This implementation returns the url unchanged; subclasses may
 * override it to produce a shorter label.
 *
 * @param url
 *            the url the node stands for
 * @return the label of the node
 * @throws URISyntaxException
 *             declared for overriding implementations; never thrown here
 */
protected String getNodeLabel(String url) throws URISyntaxException {
return url;
}
protected String getEdgeId(String nodeId1, String nodeId2) {
if (directed || nodeId1.compareTo(nodeId2) < 0)
return String.format("%s > %s", nodeId1, nodeId2);
......@@ -474,7 +497,7 @@ public class URLGenerator extends BaseGenerator {
if (internalGraph.getNode(nodeId) == null) {
addNode(nodeId);
sendNodeAttributeAdded(sourceId, nodeId, "label", nodeId);
sendNodeAttributeAdded(sourceId, nodeId, "label", getNodeLabel(url));
// System.out.printf("> new url '%s' --> '%s'\n", url, nodeId);
}
......@@ -544,8 +567,8 @@ public class URLGenerator extends BaseGenerator {
try {
parseUrl(urls.get(i));
} catch (IOException e) {
System.err.printf("Failed to parse \"%s\" : %s\n", urls
.get(i), e.getMessage());
System.err.printf("Failed to parse \"%s\" : %s\n",
urls.get(i), e.getMessage());
}
}
}
......
/*
* Copyright 2006 - 2012
* Stefan Balev <stefan.balev@graphstream-project.org>
* Julien Baudry <julien.baudry@graphstream-project.org>
* Antoine Dutot <antoine.dutot@graphstream-project.org>
* Yoann Pigné <yoann.pigne@graphstream-project.org>
* Guilhelm Savin <guilhelm.savin@graphstream-project.org>
*
* This file is part of GraphStream <http://graphstream-project.org>.
*
* GraphStream is a library whose purpose is to handle static or dynamic
* graph, create them from scratch, file or any source and display them.
*
* This program is free software distributed under the terms of two licenses, the
* CeCILL-C license that fits European law, and the GNU Lesser General Public
* License. You can use, modify and/ or redistribute the software under the terms
* of the CeCILL-C license as circulated by CEA, CNRS and INRIA at the following
* URL <http://www.cecill.info> or under the terms of the GNU LGPL as published by
* the Free Software Foundation, either version 3 of the License, or (at your
* option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
* PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* The fact that you are presently reading this means that you have had
* knowledge of the CeCILL-C and LGPL licenses and that you accept their terms.
*/
package org.graphstream.algorithm.generator;
import java.net.URISyntaxException;
/**
 * A {@link URLGenerator} preconfigured to crawl the articles of one
 * Wikipedia language edition.
 * <p>
 * The generator is directed, uses {@code Mode.PATH}, is restricted to the
 * host of the selected {@link Lang}, only accepts urls located under
 * {@code /wiki/} and declines {@code index.php} urls, the main page and
 * any page whose title starts with a {@code word:} prefix.
 */
public class WikipediaGenerator extends URLGenerator {
    /**
     * Format template of a regular expression matching the special pages
     * of a Wikipedia; the {@code %s} placeholder stands for the language
     * sub-domain. Not used by this class itself.
     */
    public static final String SPECIAL_URLS = "^https://%s[.]wikipedia[.]org/wiki/(Wikipedia|File|Special|Category|Talk|Portal|Help|Template|Template_talk):.*$";

    /** Path segment that precedes the article title in an article url. */
    private static final String WIKI_PATH = "/wiki/";

    /**
     * Supported Wikipedia language editions.
     */
    public enum Lang {
        EN("en.wikipedia.org", "Main_Page",
                "Wikipedia|File|Special|Category|Talk|Portal|Help|Template|Template_talk"),
        FR("fr.wikipedia.org", "Wikipédia:Accueil_Principal",
                "Wikipédia|Aide|Spécial|Catégorie|Portail|Discussion|Special");

        /** Host name of the edition. */
        final String host;
        /** Title of the main page, which is excluded from the crawl. */
        final String mainPage;
        /**
         * Alternation of page-title prefixes for this edition. Stored but
         * not read by this class.
         */
        final String specialFiles;

        Lang(String host, String mainPage, String special) {
            this.host = host;
            this.mainPage = mainPage;
            this.specialFiles = special;
        }
    }

    /** Language edition crawled by this generator. */
    protected final Lang lang;

    /**
     * Create a generator for the English Wikipedia.
     *
     * @param articles
     *            titles of the articles to start from; may be null
     */
    public WikipediaGenerator(String... articles) {
        this(Lang.EN, articles);
    }

    /**
     * Create a generator for the given language edition.
     *
     * @param lang
     *            language edition to crawl
     * @param articles
     *            titles of the articles to start from; may be null
     */
    public WikipediaGenerator(Lang lang, String... articles) {
        this.lang = lang;

        setDirected(true);
        setMode(Mode.PATH);
        addHostFilter(lang.host);

        declineMatchingURL("^https?://" + lang.host + "/wiki/index.php.*");
        declineMatchingURL("^https?://" + lang.host + "/wiki/" + lang.mainPage);
        declineMatchingURL("^https?://" + lang.host + "/wiki/[\\w_]+:.*$");
        acceptOnlyMatchingURL("^https?://" + lang.host + "/wiki/.*$");

        if (articles != null) {
            for (String article : articles) {
                addArticle(article);
            }
        }
    }

    /**
     * Add an article as a starting point of the crawl.
     *
     * @param name
     *            title of the article, as it appears after {@code /wiki/}
     *            in its url
     */
    public void addArticle(String name) {
        addURL("https://" + lang.host + WIKI_PATH + name);
    }

    /**
     * Use the article title (the part of the url following the first
     * {@code /wiki/}) as node label.
     * <p>
     * When the url does not contain {@code /wiki/}, it is returned
     * unchanged instead of being cut at an arbitrary offset.
     *
     * @param url
     *            url of the page the node stands for
     * @return the article title, or the url itself if it has no
     *         {@code /wiki/} segment
     */
    @Override
    protected String getNodeLabel(String url) throws URISyntaxException {
        int wikiIndex = url.indexOf(WIKI_PATH);

        // indexOf yields -1 when the marker is missing; adding the marker
        // length to it would silently produce a bogus substring offset.
        if (wikiIndex < 0)
            return url;

        return url.substring(wikiIndex + WIKI_PATH.length());
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment