View Javadoc

1   package org.varienaja.util.wikipedia;
2   
3   import java.io.IOException;
4   import java.io.InputStream;
5   import java.net.MalformedURLException;
6   import java.net.URL;
7   import java.net.URLConnection;
8   import java.util.HashMap;
9   import java.util.LinkedList;
10  import java.util.List;
11  import java.util.Map;
12  import java.util.regex.Matcher;
13  import java.util.regex.Pattern;
14  
15  import org.apache.log4j.Logger;
16  import org.varienaja.util.LRUMap;
17  import org.varienaja.util.StringUtil;
18  import org.w3c.dom.Document;
19  import org.w3c.dom.NamedNodeMap;
20  import org.w3c.dom.Node;
21  import org.w3c.dom.NodeList;
22  import org.xml.sax.InputSource;
23  import org.xml.sax.SAXException;
24  
25  import com.sun.org.apache.xerces.internal.parsers.DOMParser;
26  
27  /**
28   * Class that provides functions to search in Wikipedia.
29   * @author Varienaja
30   */
31  public class WikipediaSearcher {
32  	private static final Logger log = Logger.getLogger(WikipediaSearcher.class);
33  	
34  	/**
35  	 * Base URL for searching Wikipedia. This URL return all links as xml.
36  	 */
37  	private static final String BASEURL = "http://en.wikipedia.org/w/api.php?action=query&prop=links&format=xml&pllimit=500&titles=";
38  	private static final String CONTENTURL = "http://en.wikipedia.org/w/api.php?action=query&prop=revisions&format=xml&rvprop=content&titles=";
39  	private static final String NORMALURL = "http://en.wikipedia.org/wiki/";
40  	
41  	/**
42  	 * Cache for bandname/playlist --> genres.
43  	 */
44  	private static Map<String,String[]> _keywordcache = new HashMap<String,String[]>(10);
45  	/**
46  	 * Cache for bandnames --> bandurl.
47  	 */
48  	private static Map<String,String> _bandnamecache = new LRUMap<String,String>("WikipediaBandEntries",10);
49  
50  	/**
51  	 * <p>Searches Wikipedia for Genre(s) of a specific playlist. Results of
52  	 * this method are cached, so you can call this method safely and
53  	 * performantly for all songs of a playlist.</p>
54  	 * @param bandname The name of the Band
55  	 * @param playlistname The name of the Playlist
56  	 * @return An array of Strings, describing the Genre(s) of the Playlist
57  	 */
58  	public static String[] getKeywords (String bandname, String playlistname) {
59  		String[] cached = _keywordcache.get(bandname+" / "+playlistname);
60  		if (cached==null) {
61  			log.debug("Cache miss for: "+bandname+", playlist: "+playlistname+". Searching Wikipedia.org");
62  			cached = getKeywordsInternal(bandname,playlistname);
63  			_keywordcache.put(bandname+" / "+playlistname,cached);
64  		} else {
65  			log.debug("Cache hit for: "+bandname+", playlist: "+playlistname);
66  		}
67  		return cached;
68  	}
69  	
70  	/**
71  	 * Searches Wikipedia for the URL of the given band.
72  	 * TODO Refactor repetitive code.
73  	 * @param bandname The Band to search for.
74  	 * @return The String that can be appended to the stardard-Wikipedia URL to
75  	 * open the page for the Band. The result is null, of no band was found.
76  	 * @throws WikipediaException When searching went wrong somehow.
77  	 */
78  	protected static String getBandURL(String bandname) throws WikipediaException {
79  		String cached = _bandnamecache.get(bandname);
80  		if (cached==null) {
81  			log.debug("Cache miss for: "+bandname+". Searching Wikipedia.org");
82  			Document doc = null;
83  			List<String> links = null;
84  			String candidate = bandname;
85  			
86  			//Step 1
87  			doc = queryWikipedia(getSearchURL(candidate));
88  			if (doc==null) {
89  				throw new WikipediaException("Wikipedia unavailable? Check your internet connection.");
90  			}
91  			
92  			//Step 2
93  			links = getLinksFromDocument(doc);
94  			
95  			//If we only get or an explicit Wikipedia:Redirect result, we're probably being redirected: follow the first link.
96  			if (links.size()==1 || links.contains("Wikipedia:Redirect")) {
97  				candidate = links.get(0);
98  				doc = queryWikipedia(getSearchURL(candidate));
99  				links = getLinksFromDocument(doc);
100 			}
101 			
102 			String origCandidate = candidate;
103 			candidate = bandname+" (band)";
104 			if (StringUtil.contains(links,candidate)) { //Try (band)-suffix
105 				_bandnamecache.put(bandname, candidate);
106 				return candidate;
107 			}
108 				candidate = bandname+" (entertainer)";
109 				if (StringUtil.contains(links,candidate)) {
110 					_bandnamecache.put(bandname, candidate);
111 					return candidate;
112 				} else {
113 					candidate = bandname+" (disambiguation)";
114 					if (StringUtil.contains(links,candidate)) {
115 						doc = queryWikipedia(getSearchURL(candidate));
116 						links = getLinksFromDocument(doc);
117 						candidate = bandname+" (band)";
118 						if (StringUtil.contains(links,candidate)) {
119 							_bandnamecache.put(bandname, candidate);
120 							return candidate;
121 						}
122 					}
123 				}
124 			_bandnamecache.put(bandname, origCandidate);
125 			return origCandidate;
126 		} else {
127 			log.debug("Cache hit for: "+bandname+".");
128 			return cached;
129 		}
130 	}
131 	
132 	/**
133 	 * <p>Searches Wikipedia for Genre(s) of a specific playlist. The strategy
134 	 * is as follows:</p>
135 	 * <ol>
136 	 *    <li>We search for "bandname"</li>
137 	 *    <li>If the resulting Document has no links, we try "bandname (band)".
138 	 *        If the resulting Document has a link "bandname (disambiguation)"
139 	 *        or "bandname (entertainer)" we will follow that one and do this
140 	 *        step again.<br/>
141 	 *        If the resulting Document has a link "bandname (band)", we will
142 	 *        follow that link.</br>
143 	 *        Otherwise, wo do nothing. (We're probably right where we want to
144 	 *        be.)</li>
145 	 *    <li>We scan for a link to "bandname discograpy", and follow it if
146 	 *        present. Then, we scan for a link to the playlist or to 
147 	 *        "playlist (album) on this page. If it is present we follow this
148 	 *        link. If there isn't a link to a Playlist, we return null.
149 	 *        </li>
150 	 *    <li>We are on the page of the Playlist. We search for the genres and
151 	 *        return them.</li> 
152 	 * </ol> 
153 	 * @param bandname The name of the Band
154 	 * @param playlistname The name of the Playlist
155 	 * @return An array of Strings, describing the Genre(s) of the Playlist
156 	 */
157 	private static String[] getKeywordsInternal(String bandname, String playlistname) {
158 		String[] result = new String[0];
159 		Document doc = null;
160 		List<String> results = null;
161 		
162 		//Step 1+2
163 		try {
164 			String url = getBandURL(bandname);
165 			log.debug("Wikipedia URL for band: " + bandname+" is: "+url);
166 			doc = queryWikipedia(getSearchURL(url));
167 		} catch (WikipediaException e) {
168 			log.error(e.toString());
169 			return result; //Return immediately, if communication with Wikipedia failed
170 		}
171 		
172 		//Step 3
173 		results = getLinksFromDocument(doc);
174 		//TODO Create a contains that returns the item closest to the query?
175 		if (StringUtil.contains(results,bandname+" discography")) {
176 			doc = queryWikipedia(getSearchURL(bandname+" discography"));
177 			results = getLinksFromDocument(doc);
178 		}
179 		if (StringUtil.contains(results,playlistname+" (album)")) {
180 			doc = queryWikipedia(getContentURL(playlistname+" (album)"));
181 		} else {
182 			if (StringUtil.contains(results,playlistname)) {
183 				doc = queryWikipedia(getContentURL(playlistname));
184 			} else {
185 				return result;
186 			}
187 		}
188 		
189 		//Step 4
190 		String content = getContentFromDocument(doc);
191 		result = extractGenrePart(content);
192 		return result;
193 	}
194 	
195 	/**
196 	 * Returns the Document object which was created from the xml in the
197 	 * inputstream.
198 	 * @param in The InputStream containing XML-data
199 	 * @return The w3c.dom.Document
200 	 */
201 	protected static Document getDocumentFromInputStream(InputStream in) {
202 		InputSource source = new InputSource(in);
203 		DOMParser parser = new DOMParser();
204 		try {
205 			parser.parse(source);
206 			Document doc = parser.getDocument();
207 			return doc;
208 		} catch (SAXException e) {
209 			log.error("Error processing InputStream: "+e.getMessage());
210 		} catch (IOException e) {
211 			log.error("Error processing InputStream: "+e.getMessage());
212 		}
213 		return null;
214 	}
215 	
216 	/**
217 	 * Returns all links that are present in a certain InputStream. You should
218 	 * only call this method with InputStream-parameters with xml-data in them
219 	 * that comes from Wikipedia.org
220 	 * @param in The InputStream containing XML-data
221 	 * @return All (wikipedia)links that are found in the InputStream, or null
222 	 *         if an error occured.
223 	 */
224 	protected static List<String> getLinks(InputStream in) {
225 		Document doc = getDocumentFromInputStream(in);
226 		return doc==null ? null : getLinksFromDocument(doc);
227 	}
228 	
229 	/**
230 	 * Returns all links that are present in a Document. You should
231 	 * only call this method with Document-parameters with data in them
232 	 * that comes from Wikipedia.org
233 	 * TODO Use XPath?
234 	 * @param doc The document
235 	 * @return All (wikipedia)links that are found in the Document
236 	 */
237 	protected static List<String> getLinksFromDocument(Document doc) {
238 		List<String> results = new LinkedList<String>();
239 		NodeList list = doc.getElementsByTagName("links");
240 		for (int i=0; i<list.getLength(); i++) {
241 			Node node = list.item(i);
242 			NodeList children = node.getChildNodes();
243 			for (int j=0; j<children.getLength(); j++) {
244 				Node child = children.item(j);
245 				NamedNodeMap map = child.getAttributes();
246 				Node title = map.getNamedItem("title");
247 				results.add(title.getTextContent());
248 			}
249 		}
250 		return results;
251 	}
252 	
253 	/**
254 	 * Returns the content-element of an XMLDocument.
255 	 * @param doc The Document
256 	 * @return Everything between &lt;content&gt; and &lt;/content&gt;
257 	 */
258 	protected static String getContentFromDocument(Document doc) {
259 		StringBuilder sb = new StringBuilder();
260 		NodeList list = doc.getElementsByTagName("rev");
261 		for (int i=0; i<list.getLength(); i++) {
262 			Node node = list.item(i);
263 			sb.append(node.getTextContent());
264 		}
265 		return sb.toString();
266 	}
267 	
268 	/**
269 	 * Constructs a URL for searching Wikipedia
270 	 * @param query The query
271 	 * @return The URL
272 	 */
273 	protected static String getSearchURL(String query) {
274 		return (BASEURL+query).replaceAll(" ","%20");
275 	}
276 	
277 	/**
278 	 * Constructs a URL for getting contents from Wikipedia
279 	 * @param query The query
280 	 * @return The URL
281 	 */
282 	protected static String getContentURL(String query) {
283 		return (CONTENTURL+query).replaceAll(" ","%20");
284 	}
285 	
286 	/**
287 	 * Searches Wikipedia, and returns the resulting page as a
288 	 * completely parsed XMLDocument
289 	 * @param location The URL
290 	 * @return The Document, containing the links of the page.
291 	 */
292 	protected static Document queryWikipedia(String location) {
293 		try {
294 			URL url = new URL(location);
295 			URLConnection urlc = url.openConnection();
296 			urlc.setRequestProperty("user-agent","Mozilla/5.0"); //TODO See if this is nessecary
297 			InputStream in = urlc.getInputStream();
298 			InputSource source = new InputSource(in);
299 			DOMParser parser = new DOMParser();
300 			parser.parse(source);
301 			in.close();
302 			return parser.getDocument();
303 		} catch (MalformedURLException e) {
304 			log.error("Error searching Wikipedia: "+e.getMessage());
305 		} catch (IOException e) {
306 			log.error("Error searching Wikipedia: "+e.getMessage());
307 		} catch (SAXException e) {
308 			log.error("Error searching Wikipedia: "+e.getMessage());
309 		}
310 		return null;
311 	}
312 	
313 	/**
314 	 * Searches for a substring like 
315 	 * "Genre       =[[Rhythm and blues|R&amp;B]], [[Funk]], [[Rock music|Rock]]"
316 	 * in the content.
317 	 * @param content The String to search in
318 	 * @return A String Array, containing the Keywords found.
319 	 */
320 	protected static String[] extractGenrePart(String content) {
321 		Pattern pattern = Pattern.compile("Genre\\s*=\\s*(\\[\\[.*?\\]\\]).?\\s\\|");
322 		Matcher matcher = pattern.matcher(content);
323 		if (matcher.find()) {
324 			if (log.isDebugEnabled()) {
325 				for (int i=0;i<=matcher.groupCount();i++) {
326 					log.debug(matcher.group(i));
327 				}
328 			}
329 			if (matcher.groupCount()==1) {
330 				return extractGenres(matcher.group(1));
331 			}
332 		}
333 		return new String[0];
334 	}
335 	
336 	/**
337 	 * <p>Processes a String like "[[Rhythm and blues|R&amp;B]], [[Funk]], [[Rock music|Rock]]"
338 	 * and creates a String Array, containing only the text.</p>
339 	 * <p>[[X]] becomes X<br/>
340 	 * [[X|Y]] becomes Y</p>
341 	 * TODO Create unittests for this method, and use a different method for the splitting
342 	 * @param genrespart The inputstring
343 	 * @return An Array of String containing the Keywords found in the inputstring.
344 	 */
345 	protected static String[] extractGenres(String genrespart) {
346 		if (genrespart==null) {
347 			return null;
348 		}
349 		
350 		List<String> result = new LinkedList<String>();
351 		String[] items = genrespart.split(",\\s*|<br>|/");
352 		for (int i=0;i<items.length;i++) {
353 			int start = items[i].indexOf("|");
354 			if (start==-1) {
355 				start = items[i].lastIndexOf("[");
356 			}
357 			int end = items[i].indexOf("]");
358 			if (end!=-1) {
359 				result.add(items[i].substring(start+1,end).replaceAll("&amp;","&"));
360 			}
361 		}
362 		
363 		return result.toArray(new String[result.size()]);
364 	}
365 	
366 	/**
367 	 * Retrieves Band info from Wikipedia. Bandinfo is just the general contents
368 	 * of the Wikipedia-entry.
369 	 * @param bandname
370 	 * @return The first paragraph of the Wikipedia entry for the given Band.
371 	 * @throws WikipediaException When no info was found or the contents returned
372 	 * could not be deciphered.
373 	 */
374 	public static String getBandInfo(String bandname) throws WikipediaException {
375 		String url;
376 		try {
377 			url = getContentURL(getBandURL(bandname));
378 			Document doc = queryWikipedia(url);
379 			String content = getContentFromDocument(doc);
380 			
381 			//We now get the text before the first occurrence of '=='
382 			int end = content.indexOf("==");
383 			if (end==-1) {
384 				throw new WikipediaException("Wikipediacontent for band: "+bandname+" could not be deciphered");
385 			}
386 			content = content.substring(0, end);
387 			
388 			//Now, remove everything between {{ and }}, {{}} inclusive.
389 			StringBuffer sb = new StringBuffer();
390 			int depth = 0;
391 			for (char c : content.toCharArray()) {
392 				if (c=='{') {
393 					depth++;
394 				} else if (c=='}') {
395 					depth--;
396 				} else if (depth==0) {
397 					sb.append(c);
398 				}
399 			}
400 			
401 			//Once we got the first part of the text successfully, we now chop-off
402 			//everything before \n, if possible.
403 			content = sb.toString();
404 			content = WikiContentHelper.removeComment(content);
405 			content = WikiContentHelper.removeImages(content);
406 			content = content.trim();
407 			end = content.indexOf("\n");
408 			if (end!=-1) {
409 				content = content.substring(0, end);
410 			}
411 			
412 			//Now, we have to clean-up all wiki-links that are left over.
413 			content = WikiContentHelper.removeLinks(content);
414 			content = WikiContentHelper.removeRefs(content);
415 			content = WikiContentHelper.sanitizeQuotes(content);
416 			
417 			return content.trim();
418 		} catch (WikipediaException e) {
419 			throw new WikipediaException("No info found for band: "+bandname, e);
420 		}
421 	}
422 	
423 	/**
424 	 * Get the Wikipedia URL for a Band.
425 	 * @param bandname The name of the band.
426 	 * @return The (external) URL, linking to the Wikipedia-entry for the Band.
427 	 * The result is null if no URL could be found.
428 	 */
429 	public static String getExternalBandURL(String bandname) {
430 		String url = null;
431 		try {
432 			url = NORMALURL + getBandURL(bandname);
433 		} catch (WikipediaException e) {
434 			log.error("Could not create the Wikipedia URL for band: " + bandname+" : "+e);
435 		}
436 		return url;
437 	}
438 	
439 	
440 }