import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Crawls a MediaWiki site starting from a configured list of pages, runs a series of
 * regular expressions against each page to turn its HTML into wiki markup, and writes
 * the result to a local directory and/or posts it to a JSPWiki instance.
 *
 * <p>In-wiki links found on each page are followed, so the whole reachable site is
 * copied. Tested and mostly works with a site of about 50 pages; there is no built-in
 * limit on the number of pages.
 *
 * <p>Limitations: HTTP sessions are not maintained, so authentication is not supported.
 * Any failure while fetching a page aborts the whole run.
 *
 * <p>Please post any improvements or pointers here.
 *
 * @author vid
 */
public class PageCobbler {
    /** Where to start; typically Main_Page. Several pages may be given, semicolon separated. */
    private String startpage = "Main_Page;SOME_media;Sketch";
    /** Should a post whose response contains "Oops!" raise a RuntimeException? */
    private boolean DIE_ON_POST = false;
    /** Wiki HTTP base path; may also be /wiki/index.php... */
    final static String prefix = "/wiki/";
    /** Scheme and host of the wiki that is read from. */
    final static String basedomain = "http://wiki.yourdomain.org";
    /** Base URL of the wiki that pages are posted to. */
    final private String destsite = "http://wiki.yourdomain.org:8080/wiki/";
    /** null, or where to put a local copy of pages (you must create the directory). */
    final static String defdir = null;
    /** Charset used for form encoding, for written files, and as the fallback when reading. */
    private static final String ENCODING = "UTF-8";
    /** Extracts the charset parameter from a Content-Type header value. */
    private static final Pattern CHARSET =
            Pattern.compile("charset\\s*=\\s*\"?([^\\s;\"]+)", Pattern.CASE_INSENSITIVE);

    /** Matches an anchor tag and captures its href value in group 1; see setup(). */
    Pattern plinks;
    /** Regexes applied to page output, in insertion order; see setup(). */
    private Map<String, String> wrx;
    /** Regexes applied on page retrieval, in insertion order; see setup(). */
    Map<String, String> prx;
    /** Every page name that has ever been queued, so that no page is queued twice. */
    private Set<String> done;
    /** Page names still waiting to be processed, oldest first. */
    private List<String> todo;

    public static void main(String[] args) throws Exception {
        PageCobbler pc = new PageCobbler();
        pc.go();
    }

    /**
     * Runs the crawl: seeds the queue with the start pages, then fetches, converts and
     * stores pages until the queue is empty.
     *
     * @throws Exception if a page cannot be fetched or a local copy cannot be written;
     *         the remaining queue is abandoned in that case
     */
    public void go() throws Exception {
        setup();
        for (String s : startpage.split(";")) {
            String start = s.trim();
            // Record start pages as seen too, otherwise a link back to them
            // would queue (and re-post) them a second time.
            if (start.length() > 0 && done.add(start)) {
                todo.add(start);
            }
        }
        while (!todo.isEmpty()) {
            System.out.println(todo.size() + " to go.");
            String page = todo.remove(0);
            String contents = fetch(basedomain + prefix + page);
            contents = doregex(contents, prx);
            // Links are collected after the cleanup pass (navigation chrome is gone)
            // but before the markup pass rewrites the anchor tags.
            addLinks(getLinks(contents));
            contents = doregex(contents, wrx);
            if (defdir != null) {
                writePage(page, contents);
            }
            if (destsite != null) {
                postPage(page, contents);
            }
        }
        System.out.println("Done.");
    }

    /**
     * Writes the converted page into the directory named by defdir, encoded as ENCODING.
     *
     * @param page page name; path separators in it are replaced by '_' so the file
     *        always lands directly inside defdir
     * @param contents converted page text
     * @throws IOException if the file cannot be created or written
     */
    private void writePage(String page, String contents) throws IOException {
        String fileName = page.replace('/', '_').replace('\\', '_');
        File target = new File(defdir, fileName);
        BufferedWriter out = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(target), ENCODING));
        try {
            out.write(contents);
        } finally {
            out.close();
        }
    }

    /**
     * Queues every link that has not been queued before.
     *
     * @param links page names as returned by getLinks
     */
    private void addLinks(List<String> links) {
        for (String link : links) {
            if (done.add(link)) {
                todo.add(link);
            }
        }
    }

    /**
     * Applies each regex/replacement pair of the map to the page, in the map's iteration
     * order; every rule sees the output of the previous one.
     *
     * @param page text to transform
     * @param map regex to replacement string (may use $1 style group references)
     * @return the transformed text
     */
    private String doregex(String page, Map<String, String> map) {
        String result = page;
        for (Map.Entry<String, String> rule : map.entrySet()) {
            Pattern p = Pattern.compile(rule.getKey(), Pattern.MULTILINE);
            result = p.matcher(result).replaceAll(rule.getValue());
        }
        return result;
    }

    /**
     * Extracts the names of in-wiki pages linked from the given HTML.
     *
     * <p>Only href values that start with prefix are considered. A trailing
     * "#fragment" is dropped, and names containing ':' (special or namespaced pages)
     * are skipped.
     *
     * @param page HTML to scan
     * @return page names relative to prefix, in document order, possibly with duplicates
     */
    private List<String> getLinks(String page) {
        List<String> links = new ArrayList<String>();
        Matcher m = plinks.matcher(page);
        while (m.find()) {
            String href = m.group(1);
            if (!href.startsWith(prefix)) {
                continue; // offsite, in-page, or script link
            }
            String name = href.substring(prefix.length());
            int fragment = name.indexOf('#');
            if (fragment >= 0) {
                name = name.substring(0, fragment);
            }
            if (name.length() == 0 || name.indexOf(':') >= 0) {
                continue;
            }
            links.add(name);
        }
        return links;
    }

    /**
     * Initialises the work queue, the link pattern and both regex tables.
     *
     * <p>The tables are insertion-ordered because the rules depend on each other: for
     * example the named-anchor and in-wiki-link rules must run before the generic
     * anchor rules that follow them.
     */
    private void setup() {
        todo = new ArrayList<String>();
        done = new HashSet<String>();
        // [^>] keeps the match inside a single tag, so an earlier anchor without an
        // href can never swallow the link that follows it.
        plinks = Pattern.compile("<a\\s+(?:[^>]*?\\s)?href\\s*=\\s*\"([^\"]*)\"");
        prx = new LinkedHashMap<String, String>();
        wrx = new LinkedHashMap<String, String>();
        // prx regexes clean up non-content html (header and footer)
        prx.put(".*<!-- start content -->", "");
        prx.put(".*</script></p>", ""); //toc
        prx.put("<!-- Saved in parser cache with key.*", "");
        prx.put("<div class=\"printfooter\">.*", "");
        prx.put("<div class=\"editsection\".*?</div>", "");
        // wrx convert html to markup
        wrx.put("<p>(.*?)</p>", "$1\n\n");
        wrx.put("<h1>(.*?)</h1>", "\n! $1\n\n");
        wrx.put("<h2>(.*?)</h2>", "\n!! $1\n\n");
        wrx.put("<h3>(.*?)</h3>", "\n!!! $1\n\n");
        wrx.put("<h4>(.*?)</h4>", "\n!!!! $1\n\n");
        wrx.put("<a name.*?</a>", "");
        wrx.put("<a href=\"" + Pattern.quote(prefix) + ".*?>(.*?)</a>", "[$1]");
        wrx.put("<a .*?>", "[");
        wrx.put("</a>", "]");
    }

    /**
     * Downloads the given URL and returns its body as a single line of text.
     *
     * <p>Line terminators are dropped without a separator. The cleanup patterns in
     * setup() rely on this: they use '.' without DOTALL, so they only span the whole
     * page when it contains no line breaks.
     *
     * @param address absolute URL to fetch
     * @return the response body with all line terminators removed
     * @throws Exception if the URL is malformed, the request fails, or the charset
     *         announced by the server is not supported
     */
    public String fetch(String address) throws Exception {
        System.out.println("\nfetching " + address);
        URLConnection conn = new URL(address).openConnection();
        StringBuilder res = new StringBuilder();
        InputStream raw = conn.getInputStream();
        try {
            BufferedReader in = new BufferedReader(new InputStreamReader(raw, charsetOf(conn)));
            String str;
            while ((str = in.readLine()) != null) {
                res.append(str);
            }
        } finally {
            raw.close();
        }
        return res.toString();
    }

    /**
     * Posts the converted page to Edit.jsp on the destination wiki as a save request.
     *
     * <p>I/O failures are reported on stderr and otherwise ignored so that the crawl
     * can continue. If the response contains "Oops!", the error is printed, or thrown
     * as a RuntimeException when DIE_ON_POST is set.
     *
     * @param page name of the page to create or overwrite
     * @param contents wiki markup to store
     */
    public void postPage(String page, String contents) {
        StringBuilder resp = new StringBuilder();
        try {
            StringBuilder data = new StringBuilder();
            appendParam(data, "ok", "Save");
            appendParam(data, "page", page);
            appendParam(data, "action", "save");
            appendParam(data, "changenote", "uploaded by PageCobbler");
            appendParam(data, "edittime", String.valueOf(System.currentTimeMillis()));
            appendParam(data, "_editedtext", contents);

            // Encode the name in the query string as well, so that it is a valid URL
            // and agrees with the "page" form field.
            URL url = new URL(destsite + "Edit.jsp?page=" + URLEncoder.encode(page, ENCODING));
            URLConnection conn = url.openConnection();
            conn.setDoOutput(true);
            OutputStreamWriter wr = new OutputStreamWriter(conn.getOutputStream(), ENCODING);
            try {
                wr.write(data.toString());
                wr.flush();
            } finally {
                wr.close();
            }

            InputStream raw = conn.getInputStream();
            try {
                BufferedReader rd = new BufferedReader(new InputStreamReader(raw, charsetOf(conn)));
                String line;
                while ((line = rd.readLine()) != null) {
                    resp.append(line);
                }
            } finally {
                raw.close();
            }

            String body = resp.toString();
            if (body.indexOf("Oops!") > -1) {
                String err = "Got an oops on this page: " + page + "; "
                        + body.replaceFirst(".*?Oops!", "Oops!").replaceFirst("<.*", "");
                if (DIE_ON_POST) {
                    throw new RuntimeException(err);
                } else {
                    System.err.println(body);
                    System.err.println(err);
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
            System.err.println(resp.toString());
        }
    }

    /** Appends one URL-encoded name=value pair to a form body, adding '&' between pairs. */
    private static void appendParam(StringBuilder data, String name, String value) throws IOException {
        if (data.length() > 0) {
            data.append('&');
        }
        data.append(URLEncoder.encode(name, ENCODING))
            .append('=')
            .append(URLEncoder.encode(value, ENCODING));
    }

    /** Returns the charset named in the response's Content-Type header, or ENCODING if none is given. */
    private static String charsetOf(URLConnection conn) {
        String type = conn.getContentType();
        if (type != null) {
            Matcher m = CHARSET.matcher(type);
            if (m.find()) {
                return m.group(1);
            }
        }
        return ENCODING;
    }
}
Incidentally, the point of this thing is to be simple and self-contained. In my case, I wanted to move off an older MediaWiki installation without having to upgrade and set up all the web services, so this did the trick. It should also be easy for anyone to modify for particular purposes, even importing non-MediaWiki pages. The main drawback is that it thinks pages have been edited after they've been saved once; I'm not sure why that is the case when the timeout has passed.
Comments or pointers to greater works would be appreciated, if you want to encourage this kind of thing.
--davidm, 30-Jan-2007