Changeset 487
- Timestamp: 09/09/08 16:17:26 (18 months ago)
- Files: 1 modified
  - libraries/web/wikipedia.py (modified) (10 diffs)
Legend:
- Unmodified
- Added
- Removed
-
libraries/web/wikipedia.py
r461  r487
 427   427    class WikipediaPage:
 428   428
 429        - def __init__(self, title, markup, light=False, html=False):
       429  + def __init__(self, title, markup, light=False, full_strip=True):
 430   430
 431   431    """ Wikipedia page parser.
   …     …
 433   433    The expected markup is the stuff in Wikipedia's edit textarea.
 434   434    With light=True, it will onlt parse links to other articles (which is faster).
 435        - With html=True, it will preserve some HTML markup (links, bold, italic).
       435  + With full_strip=False, it will preserve some HTML markup (links, bold, italic).
 436   436
 437   437    """
   …     …
 439   439    self.title = title
 440   440    self.markup = markup
 441        - self.html = html
       441  + self.full_strip = full_strip
 442   442
 443   443    self.disambiguation = []
   …     …
 546   546    Does some commonsense maintenance as well,
 547   547    like collapsing multiple spaces.
 548        - If you specified html=True for WikipediaPage instance,
       548  + If you specified full_strip=False for WikipediaPage instance,
 549   549    some markup is preserved as HTML (links, bold, italic).
 550   550
   …     …
 552   552
 553   553    # Strip bold and italic.
 554        - if not self.html:
       554  + if self.full_strip:
 555   555    markup = markup.replace("'''", "")
 556   556    markup = markup.replace("''", "")
   …     …
 569   569    # Strip links, keeping the display alias.
 570   570    # We'll strip the ending ]] later.
 571        - if not self.html:
       571  + if self.full_strip:
 572   572    markup = re.sub(r"\[\[[^\]]*?\|", "", markup)
 573   573    else:
   …     …
 626   626
 627   627    # Strip all HTML except <math> tags.
 628        - if not self.html:
       628  + if self.full_strip:
 629   629    markup = strip_tags(markup, exclude=["math"], linebreaks=True)
 630   630
   …     …
1547  1547
1548  1548    def __init__(self, q, language="en", light=False, wait=10, asynchronous=False, cached=True,
1549        - case_sensitive=False, html=False):
      1549  + case_sensitive=False, full_strip=True):
1550  1550
1551  1551    """ A download manager for Wikipedia pages.
   …     …
1561  1561
1562  1562    self._light = light
1563        - self._html = html
      1563  + self._full_strip = full_strip
1564  1564
1565  1565    if cached:
   …     …
1587  1587    data = ""
1588  1588
1589        - WikipediaPage.__init__(self, title, data, light=self._light, html=self._html)
      1589  + WikipediaPage.__init__(self, title, data, light=self._light, full_strip=self._full_strip)
1590  1590
1591  1591    def search(q, language="en", light=False, wait=10, asynchronous=False, cached=True,
1592        - case_sensitive=False, html=False):
1593        - return WikipediaSearch(q, language, light, wait, asynchronous, cached, case_sensitive, html)
      1592  + case_sensitive=False, full_strip=True):
      1593  + return WikipediaSearch(q, language, light, wait, asynchronous, cached, case_sensitive, full_strip)
1594  1594
1595  1595    ######################################################################################################
