August 30, 2014
 
 
RSSRSS feed

Python Squeezes the Web - page 2

Introduction

  • October 23, 1999
  • By Stephen Pitts

Here's a sample set of data from the USCF's web site that we need to parse with Python



From http://www.64.com/cgi-bin/ratings.pl?nm=T&st=TN:

12-97  373p        TN 03-97 <AHREF=/cgi-bin/ratings.pl/USCF/21005567>TARVER,NATHAN</A>

12-94  401p        TN 02-95 <A HREF=/cgi-bin/ratings.pl/USCF/12613391>TASHIE,DAPHNE</A>

10-99  385         TN 11-99 <A HREF=/cgi-bin/ratings.pl/USCF/12752592>TATE,JEREMY</A>           10-05  367



From http://www.64.com/cgi-bin/ratings.pl?nm=P&st=MS:

12-96 1167p        MS 04-97 <A HREF=/cgi-bin/ratings.pl/USCF/12660161>PATTERSON,RAPHAEL C</A>

08-99 1452  1261   MS 09-00 <A HREF=/cgi-bin/ratings.pl/USCF/12499243>PATTILLO,BILLY R</A>      09-22 1476  1261

12-94  960p        MS 04-95 <A HREF=/cgi-bin/ratings.pl/USCF/12619152>PATT

ON,SAM R</A>

08-99  863         MS 04-00 <A HREF=/cgi-bin/ratings.pl/USCF/12739657>PAYNE,DANIEL</A>



The logic for the program is quite simple:

  • Retrieve a list of the USCF ID #s of players in our organization
  • Retrieve a list of state/letter combinations that we need to fetch
  • Fetch each page
  • Go through the pages line by line. If a newer rating exists to the far right, use it; otherwise use the existing rating.
  • Write the changes to the database, where the PHP3 scripts on the web server will automatically pick up on the changes

And the Python program that does the dirty work (in under 100 lines of code!):

#! /usr/bin/python

# A Python daemon that checks the uschess.org ratings against a

# mysql database and updates them accordingly.



import urllib

import re

import string

import calendar

import MySQL

# called on a per-state basis

def ProcessUSCFInfo(letter, state):

        "Processes the information from the USCF Web site and imports it into the Mysql database."



        print "Downloading data for state", state, "letter", letter

        uscf_data = urllib.urlopen("http://www.64.com/cgi-bin/ratings.pl?st=" +state + "&nm=" + letter).read()



        # the data that we want is inside the <pre> tag, but after the <b>-enclosed title

        # get a list of players

        beginMatch = beginDataRegexp.search(uscf_data)

        endMatch = endDataRegexp.search(uscf_data)

        uscf_data = uscf_data[beginMatch.end():endMatch.start()]

        uscf_player_lines = string.split(uscf_data, "\n")



        # parse the lines and fill up a list with USCFPlayer instances

        uscf_players = []

        playercount = 0

        for data_line in uscf_player_lines:

                # Use a regexp to extract the needed information from a line of

data

                this_player = USCFPlayer()

                regexp_match = perLineRegexp.match(data_line)

                if regexp_match == None: continue

                this_player.USCFRating, exp_month, exp_year, this_player.PlayerId, w_rating = regexp_match.groups()



                # make sure that this player is in the database

                if this_player.PlayerId not in player_id_list: continue



                # the weekly updates

                if(w_rating != None): this_player.USCFRating = w_rating



                # handle Life memberships and the Y2K issue in the expiration dates

                if exp_month == None:

                        this_player.ExpDate = "2099/12/31"

                else:

                else:

                        if string.atoi(exp_year) > 70: exp_year = "19" + exp_year

                        else: exp_year = "20" + exp_year



                        # get the last day of the month

                        exp_day = (calendar.monthrange(string.atoi(exp_year), string.atoi(exp_month)))[1]

                        this_player.ExpDate = exp_year+"/"+exp_month+"/"+str(exp_day)



                # add the USCFPlayer to the list

                USCFPlayers.append(this_player)

                playercount = playercount + 1

        print "Retrieved", playercount, "players from state", state, "letter", letter

        return uscf_players



# used to hold data related to a USCF Player

class USCFPlayer:

        pass



# common regexps used by ProcessUSCFInfo

beginDataRegexp = re.compile(r"<pre>\n<b>.*</b>", re.I | re.DO

TALL)

endDataRegexp = re.compile(r"</pre>")

perLineRegexp =re.compile(r".{5}\s+(\d{3,4})p?\s+(?:\d{3,4}p?)?\s*\w{2}\s+(?:(?:(\d{2})-(\d{2}))|Life)\s+<A.*USCF/(\d{8}).*/A>(?:\s+.{5}\s+(\d{3,4})\s*(?:\d{3,4}p?)?)?")

# global list of all players

USCFPlayers = []



# get a list with all of the valid playerids

db_conn = MySQL.connect("host_name_here", "user_id", "pass_word")

db_conn.selectdb("database_name")

player_id_list_tmp = db_conn.do("SELECT PlayerId FROM Players")

player_id_list = []



# eliminate all of the singletons

for pid_singleton in player_id_list_tmp:

        player_id_list.append(pid_singleton[0])



# get a list of the state/letter combinations

state_letter_list = db_conn.do("SELECT LEFT(LastName, 1) AS Letter, State, CONCAT(LEFT(LastName, 1), State) AS Sorter FROM Players GROUP BY Sorter")



# iterate and process each state/letter combo

for state_letter in state_letter_list:

        ProcessUSCFInfo(state_letter[0], state_letter[1])

# dump the whole mess to the database

print "Trying to save", len(USCFPlayers), "players to database...",

updated_players = 0

for uscf_player in USCFPlayers:

        db_conn.do("UPDATE Players SET USCFRating = " + uscf_player.USCFRating + ", ExpDate = '" + uscf_player.ExpDate  + "' WHERE PlayerId = '" + uscf_player.PlayerId + "'")



print "done"

Sitemap | Contact Us