Wikiproyecto:Bots/Repositorio/artículos-redirecciones.py
Herramientas
Acciones
General
Imprimir/exportar
En otros proyectos
Apariencia
De Wikipedia, la enciclopedia libre
actualizar · discusión · código desprotegido
Información de fichero
|
- Detalles:
# -*- coding: utf-8 -*-
# Copyright (C) 2009 emijrp
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# Create redirects without diacrits to articles or redirects with diacritics
import argparse, codecs, re, time
from datetime import datetime
import os, sys
sys.path.append(os.path.split(os.getcwd())[0])
from wikipedia import Page, Site, output as display, stopme
import pagegenerators as pg, query as api
# Each key is a string of diacritic characters; the value is the plain ASCII
# replacement.  simplify_chars() tests ``ch.lower() in key``, so keys must
# hold only lowercase, non-ASCII forms: an ASCII letter inside a key would
# make every plain occurrence of that letter get rewritten.
pairs = {
    u"àáâäãăǎąåā": "a", u'æǣ': "ae",
    u'ḃɓ': "b",
    u'çćčćĉċ': "c",
    u'đḍďḋð': "d",
    u'èéêëẽēę': "e",
    u'ḟƒ': "f",
    u'ĝġģğ': "g",
    u'ĥħ': "h",
    u'ìíîïīį': "i", u'ĳ': "ij",  # fixed: key was the two ASCII letters "ij", which corrupted every plain i/j
    u'ĵ': "j",
    u'ķ': "k",
    u'ŀļḷḹľł': "l",
    u'ñńň': "n",
    u'òóôöõøōǫ': "o",
    u'œ': "oe",
    u'ṗ': "p",
    u'ŗřṛṝ': "r",
    u'şṡšŝ': "s", u'ß': "sz",
    u'ţṫṭ': "t",
    u'þ': "th",  # fixed: was uppercase u'Þ', which the lowercased lookup could never match
    u'ùúûüŭūų': "u",
    u'ẁŵẅƿ': "w",
    u'ýỳŷÿȳỹ': "y",
    u'źžż': "z"
}
# Every character that triggers replacement, concatenated into one lookup string.
diacritics = "".join(pairs.keys())
def simplify_chars(string):
    """Return *string* with every diacritic letter replaced by its plain
    ASCII equivalent, preserving case (e.g. u'\xc1' -> u'A')."""
    out = []
    for ch in unicode(string):
        low = ch.lower()
        was_upper = (ch != low)
        if low in diacritics:
            # Locate the key group that contains this character and take
            # its ASCII replacement, restoring case afterwards.
            for group in pairs:
                if low in group:
                    ch = pairs[group].upper() if was_upper else pairs[group]
                    break
        out.append(ch)
    # Catalan ela geminada: "l·l" collapses to plain "ll".
    #word = re.sub("\W","!", word)
    return u"".join(out).replace(u"l·l", "ll")
def timedelta(td):
    """Return elapsed time since Unix timestamp *td*.

    Returns a 5-tuple ``(text, days, hours, minutes, seconds)`` where
    *text* is a compact human-readable string such as ``"1 d 2 h 3 m 4 s"``
    (zero components are omitted), or ``"0 s N ms"`` for sub-second spans.
    """
    td = datetime.now() - datetime.fromtimestamp(td)
    hours, remainder = divmod(td.seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    result = "%s%s%s%s" % (
        "%i d" % td.days if td.days else "",
        " %i h" % hours if hours else "",
        " %i m" % minutes if minutes else "",
        " %i s" % seconds if seconds else "",
    )
    if not result:
        # Sub-second span: report milliseconds.  Fixed: the original called
        # the non-existent str.rstip() (AttributeError) and stripped trailing
        # zeros from the *microsecond* count, which is not milliseconds.
        result = "0 s %i ms" % (td.microseconds // 1000)
    return result.strip(), td.days, hours, minutes, seconds
def get_filename(filename="wikipage"):
    """Build the path of the temp/log file for *filename* and args.lang."""
    # Toolserver layout: /home/<account>/... — derive the account name from
    # the script's own location on disk.
    user = sys.path[0].split("/")[2]
    path = args.path if args.path else "/home/%(u)s/temp/" % {"u": user}
    if path.startswith("*"):
        # A leading "*/" is shorthand for the current working directory.
        path = path.replace("*/", "%s/" % os.getcwd())
    if not path.endswith("/"):
        path += "/"
    return "%(p)s%(l)s%(f)s.log" % {"l": args.lang, "p": path, "f": filename}
def get_sql(query, filename="wikipage"):
    """Run *query* against the toolserver replica for ``args.lang`` and
    return the raw output lines (the first line is the column header).

    NOTE(review): the query is interpolated straight into a shell command,
    so this must only ever be called with trusted, programmatic queries.
    """
    # Fixed: the original built the output path from an undefined local
    # ``path`` (NameError) and then read a *different* file name back.
    # Use get_filename() for both, so the write and the read agree.
    logfile = get_filename(filename)
    fdata = {"l": args.lang, "q": query, "o": logfile}
    os.system(
        """mysql -h %(l)swiki-p.db.toolserver.org -e"""
        """ "use %(l)swiki_p;%(q)s" """
        """> %(o)s""" % fdata
    )
    f = codecs.open(logfile, 'r', encoding="utf-8")
    lines = f.readlines()
    f.close()
    return lines
def load_from_cache():
    """Read the previously dumped title list from the temp file and return
    it as a set of page titles (underscores turned into spaces)."""
    handle = codecs.open(get_filename(), 'r', encoding="utf-8")
    lines = handle.readlines()
    handle.close()
    debug('Cargando paginas de %swiki' % args.lang)
    # The first line is the SQL column header, so it is skipped.
    pages = set(line[:-1].strip().replace("_", " ") for line in lines[1:])
    debug(
        'Cargadas %i paginas de un total de %i [de %swiki]' % (
            len(pages), len(lines) - 1, args.lang
        )
    )
    return pages
def load_from_toolserver():
    """Dump the main-namespace page titles in [args.begin, args.end) from
    the toolserver SQL replica into the temp file, then load and return
    them as a set.

    This only works on the toolserver; elsewhere use load_from_pywikilib()
    or load_using_API().
    """
    sql = (
        u"""mysql -h %(l)swiki-p.db.toolserver.org -e """
        u""" "USE %(l)swiki_p;SELECT page_title FROM page WHERE page_title>='%(s)s' """
        u"""AND page_title<'%(t)s' AND page_namespace=0" """
        u"""> %(f)s""" % {
            "l": args.lang,
            "s": unicode(args.begin),
            "t": unicode(args.end),
            "f": get_filename()
        }
    )
    os.system(sql.encode("utf8"))
    debug(sql)
    # The dump file now exists; parsing it is exactly what the cache loader
    # does, so reuse it instead of duplicating the parsing code here.
    return load_from_cache()
def load_from_pywikilib():
    """Collect article titles from args.begin up to (but excluding)
    args.end using pagegenerators; redirects are skipped.

    This is the slowest, most resource-hungry backend.
    """
    generator = pg.AllpagesPageGenerator(
        start=args.begin, includeredirects=False, site=Site(args.lang, "wikipedia")
    )
    titles = set()
    debug('Cargando paginas de %swiki' % args.lang)
    for page in generator:
        title = page.title()
        if title == args.end:
            break
        titles.add(title)
    debug('Cargadas %i paginas [de %swiki]' % (len(titles), args.lang))
    return titles
def load_using_API():
    """Return the set of non-redirect article titles between args.begin and
    args.end, fetched through the MediaWiki API (list=allpages), following
    query-continue until exhausted."""
    pages = set()
    debug('Cargando paginas de %swiki' % args.lang)
    params = {
        "action": "query",
        "list": "allpages",
        "apfrom": args.begin,
        "apto": args.end,
        "apnamespace": 0,  # fixed: was misspelled "apnamescpace", so the API silently ignored it
        "apfilterredir": "nonredirects",
        "aplimit": "max"
    }
    # ``cont`` avoids shadowing the builtin ``next``; dict.has_key() replaced
    # with the ``in`` operator (works on both Python 2 and 3).
    cont = True
    while cont:
        data = api.GetData(params, Site(args.lang, "wikipedia"))
        cont = ("query-continue" in data
                and 'apcontinue' in data['query-continue']['allpages'])
        for page in data['query']['allpages']:
            pages.add(page['title'])
        if cont:
            params['apcontinue'] = data['query-continue']['allpages']['apcontinue']
    debug('Cargadas %i paginas [de %swiki]' % (len(pages), args.lang))
    return pages
def filter_pages(titles):
    """For every title in *titles* that contains diacritics, create the
    diacritic-free redirect if that page does not exist yet.

    Edits are only saved when --edit is given and --test is not.
    """
    candidates = set()  # simplified titles proposed so far (renamed: ``filter`` shadowed the builtin)
    edits = 0
    # Hoisted out of the loop.  Fixed: the original used a ``ur"..."`` literal,
    # which is a syntax error on Python 3; ``\\-`` keeps the same escaped dash.
    # NOTE(review): re.search() fires if *any* character matches this class,
    # making it a very weak pre-filter — confirm whether a full match
    # (anchored pattern) was intended.
    pattern = u"[a-z%s0-9\\-.,: ]" % diacritics
    for title in titles:
        if re.search(pattern, title, re.I):
            ntitle = simplify_chars(title)
            if title != ntitle and ntitle not in titles:
                candidates.add(ntitle)
                # Progress marker every 100 candidates.
                if len(candidates) % 100 == 0:
                    debug(str(len(candidates)))
                #debug(ur"[[%s]] -> [[%s]]" % (page2, page))
                page = Page(Site(args.lang, 'wikipedia'), title)
                npage = Page(Site(args.lang, 'wikipedia'), ntitle)
                if not npage.exists():
                    if page.isRedirectPage():
                        # Point straight at the final target to avoid
                        # creating a double redirect.
                        output = u"#REDIRECT [[%s]]" % page.getRedirectTarget().title()
                    else:
                        output = u"#REDIRECT [[%s]]" % title
                    debug(output)
                    if args.edit and not args.test:
                        edits += 1
                        npage.put(output, u"BOT - %s" % output)
    debug("Se han realizado %i ediciones de %i disponibles sobre %i paginas cargadas." % (edits, len(candidates), len(titles)))
def debug(string):
    """Print *string* unless --quiet was given; --test always prints."""
    if args.quiet and not args.test:
        return
    display(string)
def main():
    """Load page titles with the backend selected on the command line, then
    create the diacritic-free redirects for them."""
    t = time.time()
    debug(u"[\3{lightyellow}%s\3{default}] Empezamos." % time.strftime("%H:%M:%S"))
    try:
        if args.cache:
            if os.path.exists(get_filename()):
                pages = load_from_cache()
            else:
                debug("El fichero temporal no existe, iniciando la consulta SQL...")
                pages = load_from_toolserver()
        elif args.piwikimedia:
            pages = load_from_pywikilib()
        elif args.use_api:
            pages = load_using_API()
        else:
            pages = load_from_toolserver()
    except KeyboardInterrupt:
        # Fixed: the original fell through with ``pages`` unbound, raising
        # NameError at filter_pages() below.  Stop cleanly instead.
        debug("Cancelled by user...")
        return
    debug(u"[\3{lightpurple}%s\3{default}] OK. Se ha tardado: %s." % (time.strftime("%H:%M:%S"), timedelta(t)[0]))
    filter_pages(pages)
    if args.remove:
        os.system("rm %s" % get_filename())
if __name__ == '__main__':
    # Command-line interface.  Help-text typos fixed: "artcículos" ->
    # "artículos", stray ")" after "toolserver", "durante del" -> "durante el".
    parser = argparse.ArgumentParser(
        description="Crea redirecciones sin acentuación de artículos que contengan diacríticas en su título.",
        usage="%(prog)s [--lang <lang>] [--begin <A>] [--end <M>] [--path </home/emijrp/temporal/>] [--api|--cache|--pgen] [--remove]"
    )
    parser.add_argument("--lang", "-l", default="es", help="Idioma del proyecto. (Opcional, por defecto: '%(default)s'.)", metavar="es")
    parser.add_argument("--begin", "-b", default="!", type=unicode, help="Primer artículo", metavar="!")
    parser.add_argument("--end", "-e", default=u"ÿ", type=unicode, help="Último artículo", metavar="ÿ")
    parser.add_argument("--pgen", "-g", dest="piwikimedia", action="store_true", default=False, help="usar método de pagegenerator, no recomendable, es el más lento y el que más recursos consume.")
    parser.add_argument("--api", "-a", dest="use_api", action="store_true", default=False, help="usar API, recomendable si no se dispone de acceso al toolserver.")
    parser.add_argument("--cache", "-C", action="store_true", default=False, help="usar caché (ficheros temporales, solo para toolserver)")
    parser.add_argument("--edit", "-E", action="store_true", default=False, help="editar, imprescindible para que el bot realice los cambios")
    parser.add_argument("--remove", "-R", action="store_true", default=False, help="eliminar archivos temporales (solo para toolserver)")
    parser.add_argument("--path", "-H", default=None, help="ruta fichero (solo para toolserver; por defecto: /home/{USER}/temp/)", metavar="/home/{USER}/temp/")
    parser.add_argument("--quiet", "-Q", action="store_true", default=False, help="anula la información adicional durante el desarrollo del programa.")
    parser.add_argument("--test", "-T", action="store_true", default=False, help="activar modo pruebas (no permite editar y muestra toda la información adicional.)")
    args = parser.parse_args()
    try:
        main()
    except KeyboardInterrupt:
        display("Cancelled by user...")
    finally:
        # Always release the pywikipedia throttle, even on error/interrupt.
        stopme()