""" soupselect.py CSS selector support for BeautifulSoup. soup = BeautifulSoup('...') select(soup, 'div') - returns a list of div elements select(soup, 'div#main ul a') - returns a list of links inside a ul inside div#main """ import re tag_re = re.compile('^[a-z0-9]+$') attribselect_re = re.compile( r'^(?P\w+)?\[(?P\w+)(?P[=~\|\^\$\*]?)' + r'=?"?(?P[^\]"]*)"?\]$' ) # /^(\w+)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/ # \---/ \---/\-------------/ \-------/ # | | | | # | | | The value # | | ~,|,^,$,* or = # | Attribute # Tag def attribute_checker(operator, attribute, value=''): """ Takes an operator, attribute and optional value; returns a function that will return True for elements that match that combination. """ return { '=': lambda el: el.get(attribute) == value, # attribute includes value as one of a set of space separated tokens '~': lambda el: value in el.get(attribute, '').split(), # attribute starts with value '^': lambda el: el.get(attribute, '').startswith(value), # attribute ends with value '$': lambda el: el.get(attribute, '').endswith(value), # attribute contains value '*': lambda el: value in el.get(attribute, ''), # attribute is either exactly value or starts with value- '|': lambda el: el.get(attribute, '') == value \ or el.get(attribute, '').startswith('%s-' % value), }.get(operator, lambda el: el.has_key(attribute)) def select(soup, selector): """ soup should be a BeautifulSoup instance; selector is a CSS selector specifying the elements you want to retrieve. """ tokens = selector.split() current_context = [soup] for token in tokens: m = attribselect_re.match(token) if m: # Attribute selector tag, attribute, operator, value = m.groups() if not tag: tag = True checker = attribute_checker(operator, attribute, value) found = [] for context in current_context: found.extend([el for el in context.findAll(tag) if checker(el)]) current_context = found continue if '#' in token: # ID selector tag, id = token.split('#', 1) if not tag: tag = True el = current_context[0].find(tag, {'id': id}) if not el: return [] # No match current_context = [el] continue if '.' in token: # Class selector tag, klass = token.split('.', 1) if not tag: tag = True found = [] for context in current_context: found.extend( context.findAll(tag, {'class': lambda attr: attr and klass in attr.split()} ) ) current_context = found continue if token == '*': # Star selector found = [] for context in current_context: found.extend(context.findAll(True)) current_context = found continue # Here we should just have a regular tag if not tag_re.match(token): return [] found = [] for context in current_context: found.extend(context.findAll(token)) current_context = found return current_context def monkeypatch(BeautifulSoupClass=None): """ If you don't explicitly state the class to patch, defaults to the most common import location for BeautifulSoup. """ if not BeautifulSoupClass: from BeautifulSoup import BeautifulSoup as BeautifulSoupClass BeautifulSoupClass.findSelect = select def unmonkeypatch(BeautifulSoupClass=None): if not BeautifulSoupClass: from BeautifulSoup import BeautifulSoup as BeautifulSoupClass delattr(BeautifulSoupClass, 'findSelect')