Some scripts that I have written for tasks on Wikipedia, both to learn how to work programmatically with Wikipedia and to become more familiar with pywikipediabot. Comments, criticism, questions, suggestions etc. are welcome.

Python script to find links incorrectly tagged with disambiguation templates, used in response to this request. It generates a lot of false positives, so the results need manual inspection and editing. One possible improvement would be to test whether the link tagged with {{dn}} has changed since it was tagged, although this would obviously miss cases where the destination page has been changed from a disambiguation page. Depends on pywikipediabot.

#!/usr/bin/python

import re
import wikipedia, catlib, pagegenerators
import webbrowser


def get_disam_links(page):
    """
    Returns a list of linked page title that have
    a {{Disambiguation Needed}} template from a given page.
    """
    disam_re = re.compile(r"\{\{Disambiguation Needed(\|date=|\}\})|" +
            r"\{\{dn(\|date=|\}\})", re.I)
    res = []
    found = disam_re.search(page)
    while found:
        try:
            link_start = page.rindex("[[", 0, found.start())
        except ValueError:
            return []
        # use find() for the pipe so a page with no later "|" doesn't raise ValueError
        pipe_pos = page.find("|", link_start)
        link_end = page.index("]]", link_start)
        if pipe_pos != -1:
            link_end = min(pipe_pos, link_end)
        res.append(page[link_start + 2:link_end])
        found = disam_re.search(page, found.end())
    disam_dep_re = re.compile(
            r"\{\{Disambiguation Needed\|(?!date=)[^|}]*(\|[^|}]*)?(\|date=[^}]*)?\}\}|" +
            r"\{\{dn\|(?!date=)[^|}]*(\|[^|}]*)?(\|date=[^}]*)?\}\}",
            re.I)
    found_dep = disam_dep_re.search(page)
    while found_dep:
        res.append(found_dep.group().strip("{}").split("|")[1])
        # continue searching with the same dependent-form regex, not disam_re
        found_dep = disam_dep_re.search(page, found_dep.end())
    return res

def find_fulfilled_dn_templates(category_title, start=None):
    """
    Returns a list of wikipedia.Page objects that have {{dn}} templates
    preceded by, or containing a link that doesn't lead to a Disambiguation
    page
    """
    site = wikipedia.getSite()
    category = catlib.Category(site, category_title)
    catgen = pagegenerators.CategorizedPageGenerator(category, start=start)
    res = []
    try:
        for article in catgen:
            exists = False
            print "\nPAGE",article
            link_titles = get_disam_links(article.get())
            for link in link_titles:
                link_page = wikipedia.Page(site, link)
                print link_page
                while link_page.isRedirectPage():
                    link_page = link_page.getRedirectTarget()
                    print "redirecting", link_page
                if link_page.exists() and not link_page.isDisambig():
                    print "***********true**********"
                    exists = True
                else:
                    print "false"
            if exists:
                res.append(article)
    except:
        import traceback
        traceback.print_exc()
        return res
    return res
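
A quick way to sanity-check the link extraction is to call get_disam_links() on a literal wikitext string; the sample below is invented, not taken from a real article.

# Rough sanity check for get_disam_links(); the wikitext is made up.
sample = (u"The [[Mercury|planet]]{{dn|date=March 2013}} is hot, "
          u"and so is {{dn|Venus|date=March 2013}}.")
print get_disam_links(sample)
# prints [u'Mercury', u'Venus']: one link preceding a {{dn}} tag,
# the other given as the first parameter of the template itself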

Python script written for this request; it checks language articles' infobox ISO codes against the Ethnologue code list. Depends on pywikipediabot and the infobox script below.

#!/usr/bin/python

import infobox
import wikipedia

def get_languages():
    """Hackish and fragile, any changes to the page will probably break it"""
    site = wikipedia.getSite()
    langs = wikipedia.Page(site, "Wikipedia:WikiProject Languages/Primary language names in Ethnologue 16 by ISO code").get()
    langs = langs[langs.find("[[", langs.find("==Codes==")):
            langs.rfind("]]", 0, langs.find("</tt>")) + 2]
    language_list = [lang.strip("[]") for lang in langs.split("\n")]
    return [tuple(lang.split("|")) for lang in language_list]

def check_languages(start=None, end=None):
    res = []
    disams = []
    misc = []
    site = wikipedia.getSite()
    for language in get_languages()[start:end]:
        try:
            lang_page = wikipedia.Page(site, language[0])
            if lang_page.exists():
                while lang_page.isRedirectPage():
                    lang_page = lang_page.getRedirectTarget()
                if lang_page.isDisambig():
                    disams.append(language)
#                    print "disambiguation", language
                    continue
                try:
                    parsed_infobox = infobox.infobox_parse(lang_page)
                except Exception:
#                    print "parse error", language
                    misc.append(language)
                    continue
                params = [parsed_infobox[key] for key in parsed_infobox
                        if key.startswith("lc") or key == "iso3"]
                if all(param != language[1] for param in params):
#                    print "param", language
                    res.append(language)
        except Exception:
#            print "other error", language
            misc.append(language)
    return res, disams, misc
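
A minimal way to run the check (illustrative only; the slice just keeps the first run small):

if __name__ == "__main__":
    # Check the first 50 ISO codes and summarise the three result lists.
    mismatched, disambigs, failed = check_languages(start=0, end=50)
    print len(mismatched), "articles whose infobox code does not match Ethnologue"
    print len(disambigs), "codes whose article is a disambiguation page"
    print len(failed), "articles that could not be fetched or parsed"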

Python script to extract the first infobox from a page and return a dict of its parameters and values. Only tested on simple infoboxes; it probably fails on some others. Depends on pywikipediabot.

#!/usr/bin/python
# Adapted from:
# http://mcstrother.wordpress.com/2011/02/22/scraping-and-parsing-wikipedia-infoboxes-in-python/

import re
import sys
import wikipedia

def get_infobox_from_text(article_text):
    # Build a regexp to find the first infobox in the article text
    exp = r'\{\{'                # the opening brackets of the infobox
    exp = exp + r'\s*'           # any amount of whitespace
    exp = exp + r'[Ii]nfobox +'  # the word "infobox", capitalized or not, followed by at least one space
#    if box_title:
#        exp = exp + box_title     # the infobox title, capitalized or not
#        exp = exp + r'\s*\|'         # any number of spaces or returns followed by a pipe character
    exp = exp + r'.*'            # a bunch of other stuff in the infobox
    exp3 = exp                   # save the regexp so far so that I can use it later
    exp3 = exp3 + r'.*\}\}'      # any amount of anything, followed by the end of the infobox

    exp3_obj = re.compile(exp3, re.DOTALL)
    search_result = exp3_obj.search(article_text)
    if search_result:
        result_text = search_result.group(0) # returns the entire matching sequence
    else:
        return None
    # the regex isn't perfect, so look for the closing brackets of the infobox
    count = 0
    last_ind = None
    for ind, c in enumerate(result_text):
        if c == '}':
            count = count -1
        elif c == '{':
            count = count +1
        if count == 0 and not ind == 0:
            last_ind = ind
            break
    return result_text[0:last_ind+1]

def parse_infobox_text(text):
    text = text.split('|')
    text = text[1:] #everything before the first pipe is the infobox declaration
    new_list = [text[0]]
    for item in text[1:]:
        # make sure we split only on the pipes that represent ends of the infobox entry, not the pipes used in links
        if (']]' in item) and ((not '[[' in item) or item.find(']]') < item.find('[[')):
            new_list[-1] = new_list[-1] +'|' + item
        else:
            new_list.append(item)
    new_list[-1] = new_list[-1][:-2] #trim off the closing brackets
    data_dict = {}
    for item in new_list:
        if '=' in item:
            items = item.split('=', 1)
            data_dict[items[0].strip()] = items[1].strip()
        else:
            continue
    return data_dict
    
def infobox_parse(article):
    """article: wikipedia.Page object"""
    while article.isRedirectPage():
        article = article.getRedirectTarget()
    article_text = article.get()
    return parse_infobox_text(get_infobox_from_text(article_text))
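
The parser can be exercised without fetching anything; the infobox below is a made-up example.

# Made-up infobox text, just to show what the parser returns.
sample = (u"{{Infobox language\n"
          u"| name   = Sample\n"
          u"| iso3   = xyz\n"
          u"| region = [[Foo|Bar]], [[Baz]]\n"
          u"}}\nRest of the article.")
print parse_infobox_text(get_infobox_from_text(sample))
# {u'name': u'Sample', u'iso3': u'xyz', u'region': u'[[Foo|Bar]], [[Baz]]'}
# (dict key order may vary)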

Simpler and probably more robust approach to infobox parsing, using wikipedia.Page.templatesWithParams(). Depends on pywikipediabot.

#!/usr/bin/python

import wikipedia

def parse_infoboxes(page, *template_titles):
    """
    Returns a list of parsed templates that have the titles given, or all
    starting with "Infobox" if not given.

    page: wikipedia.Page object
    """
    templates = []
    res = []
    if template_titles:
        templates = [template for template in page.templatesWithParams()
                if template[0] in template_titles]
    else:
        templates = [template for template in page.templatesWithParams()
                if template[0].startswith("Infobox")]
    for template in templates:
        template_dict = {}
        for param in template[1]:
            if "=" in param:
                split_param = param.split("=", 1)
                template_dict[split_param[0].strip()] = split_param[1].strip()
        res.append(template_dict)
    return res
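
A usage sketch, assuming pywikipediabot is configured; the article title is an arbitrary example.

site = wikipedia.getSite()
page = wikipedia.Page(site, u"English language")  # arbitrary example article
for box in parse_infoboxes(page):
    # one dict per template whose name starts with "Infobox"
    print sorted(box.keys())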

chart_references.py

Script for this request; it lists pages where a reference name set through {{Singlechart}}'s refname parameter is also used by a full <ref> tag that carries its own citation content.

#!/usr/bin/python

import wikipedia
import bs4
import catlib

def main():
    site = wikipedia.getSite()
    cat = catlib.Category(
            site, "Category:Singlechart making named ref").articles()
    res = []
    for page in cat:
#        print page
        if has_ref_conflict(page):
#            print "found"
            res.append(page)
    return res

def has_ref_conflict(page):
    single_refnames = set()
    for tem in page.templatesWithParams():
        if tem[0].lower() == "singlechart":
            for param in tem[1]:
                if param.startswith("refname"):
                    single_refnames.add(param[param.find("=") + 1:].strip('"'))
                    break
    refnames = set()
    ref_tags = bs4.BeautifulSoup(page.get()).find_all("ref")
    for tag in ref_tags:
        if tag.has_attr("name") and tag.contents and not tag.is_empty_element:
            refnames.add(tag.attrs["name"])
    return refnames & single_refnames
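
The ref-name extraction in has_ref_conflict() can be tried on a plain wikitext string (invented here), assuming bs4 is installed; a self-closing ref has no contents, so it is filtered out.

sample = u'A claim.<ref name="uk">Full citation here.</ref> More text.<ref name="uk"/>'
soup = bs4.BeautifulSoup(sample)
names = [tag.attrs["name"] for tag in soup.find_all("ref")
         if tag.has_attr("name") and tag.contents and not tag.is_empty_element]
print names  # [u'uk'] -- only the ref that actually holds a citation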

merge_template.py

#!/usr/bin/python

import wikipedia
import catlib

def main(sim=True):
    site = wikipedia.getSite()
    wikipedia.simulate = sim
    wikipedia.verbose = 1
    cat = catlib.Category(
            site, "Category:All articles to be merged").articles()
    res = []
    for page in cat:
        print page
        if page.namespace():
            print "namespace: ", page.title()
            continue
        for tem in page.templatesWithParams():
            if tem[0].lower().startswith("merge"): 
                merge_targets = []
                for param in tem[1]:
                    if "=" not in param:
                        merge_targets.append(wikipedia.Page(site, param))
                    else:
                        break
                break
        else:
            continue
        for target_page in merge_targets:
            if not [target_tem
                    for target_tem in target_page.templatesWithParams()
                    if target_tem[0].lower().startswith("merge")]:
                new_text = u"{{"
                if tem[0].lower() == "merge to":
                    new_text += u"Merge From"
                elif tem[0].lower() == "merge":
                    new_text += u"Merge"
                else:
                    new_text += u"Merge to"
                new_text += u"|" + page.title() + u"}}\n\n"
                new_text += target_page.get()
                if raw_input(
                        new_text.encode("utf-8") + "\n\n"
                        + "Edit " + target_page.title().encode("utf-8") + " ?"
                        ) == "y":
                    target_page.put(new_text, comment=u"Add merge template")
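
The script is interactive and, judging by the sim flag, is meant to be dry-run first; a minimal way to invoke it (illustrative only):

if __name__ == "__main__":
    # Keep sim=True (the default) until the proposed edits look right.
    main(sim=True)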