package org.varienaja.util.wikipedia;

import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import org.apache.log4j.Logger;
import org.varienaja.util.LRUMap;
import org.varienaja.util.StringUtil;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
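
/**
 * Looks up band information on Wikipedia through the MediaWiki API: page links are
 * used to resolve a band name to its article title, and the article wikitext is then
 * mined for genre keywords ({@link #getKeywords}) and a short description
 * ({@link #getBandInfo}).
 *
 * Illustrative usage (the band and playlist names below are made-up examples):
 * <pre>
 *   String[] genres = WikipediaSearcher.getKeywords("Some Band", "Some Album");
 *   String info    = WikipediaSearcher.getBandInfo("Some Band");
 *   String url     = WikipediaSearcher.getExternalBandURL("Some Band");
 * </pre>
 */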
public class WikipediaSearcher {
  private static final Logger log = Logger.getLogger(WikipediaSearcher.class);

  /** MediaWiki API query that lists the links on a page (up to 500). */
  private static final String BASEURL = "http://en.wikipedia.org/w/api.php?action=query&prop=links&format=xml&pllimit=500&titles=";
  /** MediaWiki API query that fetches the current wikitext of a page. */
  private static final String CONTENTURL = "http://en.wikipedia.org/w/api.php?action=query&prop=revisions&format=xml&rvprop=content&titles=";
  /** Base URL of human-readable Wikipedia articles. */
  private static final String NORMALURL = "http://en.wikipedia.org/wiki/";

  /** Caches keyword lookups, keyed by "bandname / playlistname". */
  private static Map<String,String[]> _keywordcache = new HashMap<String,String[]>(10);

  /** LRU cache of resolved Wikipedia article titles, keyed by band name. */
  private static Map<String,String> _bandnamecache = new LRUMap<String,String>("WikipediaBandEntries",10);
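
  /**
   * Returns the genre keywords for the given band and playlist (album), consulting
   * Wikipedia only on a cache miss.
   *
   * @param bandname name of the band
   * @param playlistname name of the playlist (typically an album title)
   * @return the genres found for the album, or an empty array if nothing was found
   */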
  public static String[] getKeywords(String bandname, String playlistname) {
    String[] cached = _keywordcache.get(bandname+" / "+playlistname);
    if (cached==null) {
      log.debug("Cache miss for: "+bandname+", playlist: "+playlistname+". Searching Wikipedia.org");
      cached = getKeywordsInternal(bandname,playlistname);
      _keywordcache.put(bandname+" / "+playlistname,cached);
    } else {
      log.debug("Cache hit for: "+bandname+", playlist: "+playlistname);
    }
    return cached;
  }
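
  /**
   * Resolves a band name to the title of its Wikipedia article. The links on the page
   * for the plain band name are inspected; redirects are followed, and the
   * "(band)", "(entertainer)" and "(disambiguation)" variants are tried in that order.
   * Results are cached per band name.
   *
   * @param bandname name of the band
   * @return the Wikipedia article title to use for this band
   * @throws WikipediaException if Wikipedia could not be reached
   */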
  protected static String getBandURL(String bandname) throws WikipediaException {
    String cached = _bandnamecache.get(bandname);
    if (cached==null) {
      log.debug("Cache miss for: "+bandname+". Searching Wikipedia.org");
      Document doc = null;
      List<String> links = null;
      String candidate = bandname;

      // Query the links of the page named after the band itself.
      doc = queryWikipedia(getSearchURL(candidate));
      if (doc==null) {
        throw new WikipediaException("Wikipedia unavailable? Check your internet connection.");
      }

      links = getLinksFromDocument(doc);

      // A single link, or a link to Wikipedia:Redirect, indicates a redirect page: follow it.
      if (links.size()==1 || links.contains("Wikipedia:Redirect")) {
        candidate = links.get(0);
        doc = queryWikipedia(getSearchURL(candidate));
        links = getLinksFromDocument(doc);
      }

      // Prefer the "(band)" article, then "(entertainer)", then whatever the
      // "(disambiguation)" page links to as "(band)".
      String origCandidate = candidate;
      candidate = bandname+" (band)";
      if (StringUtil.contains(links,candidate)) {
        _bandnamecache.put(bandname, candidate);
        return candidate;
      }
      candidate = bandname+" (entertainer)";
      if (StringUtil.contains(links,candidate)) {
        _bandnamecache.put(bandname, candidate);
        return candidate;
      } else {
        candidate = bandname+" (disambiguation)";
        if (StringUtil.contains(links,candidate)) {
          doc = queryWikipedia(getSearchURL(candidate));
          links = getLinksFromDocument(doc);
          candidate = bandname+" (band)";
          if (StringUtil.contains(links,candidate)) {
            _bandnamecache.put(bandname, candidate);
            return candidate;
          }
        }
      }
      // Fall back to the title we ended up with after following redirects.
      _bandnamecache.put(bandname, origCandidate);
      return origCandidate;
    } else {
      log.debug("Cache hit for: "+bandname+".");
      return cached;
    }
  }
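
  /**
   * Looks up the genres for a playlist (album) on Wikipedia: the band's article (or its
   * discography page) is searched for a link to the album, and the genre field of that
   * album's infobox is parsed.
   *
   * @param bandname name of the band
   * @param playlistname name of the playlist (typically an album title)
   * @return the genres found, or an empty array if the album or its genres could not be located
   */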
  private static String[] getKeywordsInternal(String bandname, String playlistname) {
    String[] result = new String[0];
    Document doc = null;
    List<String> results = null;

    try {
      String url = getBandURL(bandname);
      log.debug("Wikipedia URL for band: "+bandname+" is: "+url);
      doc = queryWikipedia(getSearchURL(url));
    } catch (WikipediaException e) {
      log.error(e.toString());
      return result;
    }

    results = getLinksFromDocument(doc);

    // If the band has a separate discography page, the album links live there.
    if (StringUtil.contains(results,bandname+" discography")) {
      doc = queryWikipedia(getSearchURL(bandname+" discography"));
      results = getLinksFromDocument(doc);
    }
    // Fetch the album article, preferring the explicit "(album)" title.
    if (StringUtil.contains(results,playlistname+" (album)")) {
      doc = queryWikipedia(getContentURL(playlistname+" (album)"));
    } else {
      if (StringUtil.contains(results,playlistname)) {
        doc = queryWikipedia(getContentURL(playlistname));
      } else {
        return result;
      }
    }

    // Pull the genres out of the album's infobox.
    String content = getContentFromDocument(doc);
    result = extractGenrePart(content);
    return result;
  }
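
  /**
   * Parses an XML document from the given InputStream.
   *
   * @param in stream containing an XML document
   * @return the parsed Document, or null if parsing failed
   */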
  protected static Document getDocumentFromInputStream(InputStream in) {
    InputSource source = new InputSource(in);
    try {
      DocumentBuilder parser = DocumentBuilderFactory.newInstance().newDocumentBuilder();
      return parser.parse(source);
    } catch (SAXException e) {
      log.error("Error processing InputStream: "+e.getMessage());
    } catch (IOException e) {
      log.error("Error processing InputStream: "+e.getMessage());
    } catch (ParserConfigurationException e) {
      log.error("Error processing InputStream: "+e.getMessage());
    }
    return null;
  }
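
  /**
   * Parses an XML API response from the given InputStream and returns the page titles
   * of all links it contains.
   *
   * @param in stream containing a MediaWiki API response
   * @return the link titles, or null if the stream could not be parsed
   */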
  protected static List<String> getLinks(InputStream in) {
    Document doc = getDocumentFromInputStream(in);
    return doc==null ? null : getLinksFromDocument(doc);
  }
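
  /**
   * Collects the link titles from a parsed API response: every child of a
   * {@code links} element is expected to carry a {@code title} attribute
   * (in the MediaWiki XML format these are {@code pl} elements).
   *
   * @param doc parsed API response
   * @return the values of all title attributes found
   */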
  protected static List<String> getLinksFromDocument(Document doc) {
    List<String> results = new LinkedList<String>();
    NodeList list = doc.getElementsByTagName("links");
    for (int i=0; i<list.getLength(); i++) {
      Node node = list.item(i);
      NodeList children = node.getChildNodes();
      for (int j=0; j<children.getLength(); j++) {
        Node child = children.item(j);
        NamedNodeMap map = child.getAttributes();
        if (map==null) {
          continue; // skip text nodes, which have no attributes
        }
        Node title = map.getNamedItem("title");
        if (title!=null) {
          results.add(title.getTextContent());
        }
      }
    }
    return results;
  }
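
  /**
   * Extracts the wikitext from a parsed API response by concatenating the text of all
   * {@code rev} (revision) elements.
   *
   * @param doc parsed API response
   * @return the page's wikitext
   */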
  protected static String getContentFromDocument(Document doc) {
    StringBuilder sb = new StringBuilder();
    NodeList list = doc.getElementsByTagName("rev");
    for (int i=0; i<list.getLength(); i++) {
      Node node = list.item(i);
      sb.append(node.getTextContent());
    }
    return sb.toString();
  }
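
  /**
   * Builds the API URL that lists the links of the given page. Only spaces are escaped;
   * for example, a query of "Some Band" (a made-up title) yields
   * {@code ...titles=Some%20Band}.
   *
   * @param query page title to query
   * @return the full API URL
   */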
  protected static String getSearchURL(String query) {
    return (BASEURL+query).replaceAll(" ","%20");
  }
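
  /**
   * Builds the API URL that fetches the wikitext of the given page. Only spaces are escaped.
   *
   * @param query page title to query
   * @return the full API URL
   */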
  protected static String getContentURL(String query) {
    return (CONTENTURL+query).replaceAll(" ","%20");
  }
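
  /**
   * Fetches the given URL (with a browser-like user-agent) and parses the response as XML.
   *
   * @param location URL to fetch
   * @return the parsed response, or null if the request or the parsing failed
   */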
  protected static Document queryWikipedia(String location) {
    try {
      URL url = new URL(location);
      URLConnection urlc = url.openConnection();
      urlc.setRequestProperty("user-agent","Mozilla/5.0");
      InputStream in = urlc.getInputStream();
      try {
        DocumentBuilder parser = DocumentBuilderFactory.newInstance().newDocumentBuilder();
        return parser.parse(new InputSource(in));
      } finally {
        in.close();
      }
    } catch (MalformedURLException e) {
      log.error("Error searching Wikipedia: "+e.getMessage());
    } catch (IOException e) {
      log.error("Error searching Wikipedia: "+e.getMessage());
    } catch (SAXException e) {
      log.error("Error searching Wikipedia: "+e.getMessage());
    } catch (ParserConfigurationException e) {
      log.error("Error searching Wikipedia: "+e.getMessage());
    }
    return null;
  }
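
  /**
   * Finds the genre field of an infobox in the given wikitext and returns the genres
   * listed there. The field is expected to look roughly like
   * {@code Genre = [[Rock music|Rock]], [[Some genre]] |} (an illustrative, made-up line):
   * everything between "Genre =" and the next field separator is handed to
   * {@link #extractGenres(String)}.
   *
   * @param content wikitext of an article
   * @return the genres found, or an empty array if no genre field was matched
   */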
  protected static String[] extractGenrePart(String content) {
    Pattern pattern = Pattern.compile("Genre\\s*=\\s*(\\[\\[.*?\\]\\]).?\\s\\|");
    Matcher matcher = pattern.matcher(content);
    if (matcher.find()) {
      if (log.isDebugEnabled()) {
        for (int i=0;i<=matcher.groupCount();i++) {
          log.debug(matcher.group(i));
        }
      }
      if (matcher.groupCount()==1) {
        return extractGenres(matcher.group(1));
      }
    }
    return new String[0];
  }
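
  /**
   * Splits a wiki-linked genre list into plain genre names, keeping the visible label of
   * each link. For example (illustrative input),
   * {@code "[[Hard rock]], [[Heavy metal music|Heavy metal]]"} yields
   * {@code {"Hard rock", "Heavy metal"}}.
   *
   * @param genrespart the captured genre field: a comma-, {@code <br>}- or slash-separated list of wiki links
   * @return the genre names, or null if genrespart is null
   */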
  protected static String[] extractGenres(String genrespart) {
    if (genrespart==null) {
      return null;
    }

    List<String> result = new LinkedList<String>();
    String[] items = genrespart.split(",\\s*|<br>|/");
    for (int i=0;i<items.length;i++) {
      // Take the link label: the text after the "|" if present, otherwise after the
      // last "[", up to the closing "]".
      int start = items[i].indexOf("|");
      if (start==-1) {
        start = items[i].lastIndexOf("[");
      }
      int end = items[i].indexOf("]");
      if (end!=-1) {
        result.add(items[i].substring(start+1,end).replaceAll("&amp;","&"));
      }
    }

    return result.toArray(new String[result.size()]);
  }
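
  /**
   * Returns the first paragraph of the band's Wikipedia article as plain text: templates,
   * comments, images, links and references are stripped from the wikitext.
   *
   * @param bandname name of the band
   * @return a short description of the band
   * @throws WikipediaException if no usable article content could be found
   */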
  public static String getBandInfo(String bandname) throws WikipediaException {
    String url;
    try {
      url = getContentURL(getBandURL(bandname));
      Document doc = queryWikipedia(url);
      String content = getContentFromDocument(doc);

      // Keep only the lead section (everything before the first "==" heading).
      int end = content.indexOf("==");
      if (end==-1) {
        throw new WikipediaException("Wikipedia content for band: "+bandname+" could not be deciphered");
      }
      content = content.substring(0, end);

      // Drop template markup by skipping everything inside {...} braces.
      StringBuilder sb = new StringBuilder();
      int depth = 0;
      for (char c : content.toCharArray()) {
        if (c=='{') {
          depth++;
        } else if (c=='}') {
          depth--;
        } else if (depth==0) {
          sb.append(c);
        }
      }

      // Clean up the remaining wikitext and keep only the first line.
      content = sb.toString();
      content = WikiContentHelper.removeComment(content);
      content = WikiContentHelper.removeImages(content);
      content = content.trim();
      end = content.indexOf("\n");
      if (end!=-1) {
        content = content.substring(0, end);
      }

      content = WikiContentHelper.removeLinks(content);
      content = WikiContentHelper.removeRefs(content);
      content = WikiContentHelper.sanitizeQuotes(content);

      return content.trim();
    } catch (WikipediaException e) {
      throw new WikipediaException("No info found for band: "+bandname, e);
    }
  }
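
  /**
   * Returns the address of the band's Wikipedia article, i.e. NORMALURL followed by the
   * resolved article title.
   *
   * @param bandname name of the band
   * @return the article URL, or null if the title could not be resolved
   */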
  public static String getExternalBandURL(String bandname) {
    String url = null;
    try {
      url = NORMALURL + getBandURL(bandname);
    } catch (WikipediaException e) {
      log.error("Could not create the Wikipedia URL for band: "+bandname+" : "+e);
    }
    return url;
  }

}