<!--- Determine the PowerBall URL that we want to scrape. NOTE: I am building this rather than just putting it in the CFHttp tag for text-wrapping reasons. ---> <cfset strURL = ( "http://www.powerball.com/powerball/pb_frequency.asp?" & "matrixID=2&graph=0" ) /> <!--- Grab the PowerBall HTML page. ---> <cfhttp url="#strURL#" method="get" useragent="#CGI.http_user_agent#" result="objGet" /> <!--- We want to grab all the tables out of the page. One of them should be the one that we are looking for. Realize, however, that we are doing this based on what the HTML of the target page looked like at the time of this writing. One of the dangerous parts of screen scraping is that the target page may change at any time without any warning, rendering this algorithm totally useless. ---> <cfset objPattern = CreateObject( "java", "java.util.regex.Pattern" ).Compile( JavaCast( "string", "(?i)<table[^>]*>([\w\W](?!<table))+?</table>" ) ) /> <!--- Create the pattern matcher based on our regular expression pattern and the FileContent that we got from our CFHttp call. ---> <cfset objMatcher = objPattern.Matcher( JavaCast( "string", objGet.FileContent ) ) /> <!--- Set default table code. This variable will hopefully contain our target HTML table in a little bit. ---> <cfset strTableHtml = "" /> <!--- Find the proper table. ---> <cfloop condition="objMatcher.Find()"> <!--- Check to see if we have the right table. This one will contain the header values taht we are searching for. NOTE: Just a reminder that this may change at any point and only holds true at the time that this blog entry was written. ---> <cfif ( Find( "Ball Number", objMatcher.Group() ) AND Find( "White Balls", objMatcher.Group() ) AND Find( "Powerball", objMatcher.Group() ) AND Find( "Power Play", objMatcher.Group() ) )> <!--- Extract table code. ---> <cfset strTableHtml = objMatcher.Group() /> </cfif> </cfloop> <!--- Check to see if we have found the HTML table. ---> <cfif Len( strTableHtml )> <!--- Now that we have the table HTML, we want to strip out most of the data so that we can treat it as XHTML. To do this, we are going to strip out everything but the table related tags. ---> <cfset strTableHtml = strTableHtml.ReplaceAll( JavaCast( "string", "(?i)<(?!/?(ta|tr|th|td))[^>]*>" ), JavaCast( "string", " " ) ) /> <!--- Now, strip out all tag attributes. This just helps to make sure that the resultant HTML is more likely to stick to an XML standard. ---> <cfset strTableHtml = strTableHtml.ReplaceAll( JavaCast( "string", "(?i)<(/?\w+)[^>]*>" ), JavaCast( "string", "<$1>" ) ) /> <!--- ASSERT: At this point, the XHTML that we have remaining should function as a basic XML document, which we can query off of. ---> <!--- Now, let's build a ColdFusion query that will hold the values of the HTML. ---> <cfset qPowerBall = QueryNew( "ball_number, white_ball_fr, powerball_fr, power_play_fr", "varchar, varchar, varchar, varchar" ) /> <!--- Get all TR elements that are in the 3rd or greater position (we have no need to grab the headers or the buffer row that exists between the header and the body rows). ---> <cfset arrTR = XmlSearch( Trim( strTableHtml ), "//tr[ position() > 2 ]" ) /> <!--- Loop over the TR nodes. ---> <cfloop index="intTRIndex" from="1" to="#ArrayLen( arrTR )#" step="1"> <!--- Get a short hand reference for the current TR node. ---> <cfset xmlTR = arrTR[ intTRIndex ] /> <!--- For each row, we are going to want to add a record the PowerBall query we are building. ---> <cfset QueryAddRow( qPowerBall ) /> <!--- For each row, we might have anywhere from zero to four columns. We won't know until we query for the TD text. When querying for text, we want to make sure to strip out the white space which we may have created when cleaning the HTML. Normally, we would use normalize-space() to trim the text() value, but CFMX7 does not support this yet. ---> <cfset arrTD = XmlSearch( xmlTR, "td/text()" ) /> <!--- Loop over the TD values. ---> <cfloop index="intTDIndex" from="1" to="#ArrayLen( arrTD )#" step="1"> <!--- The column to which we add this value will depend on the index of the current TD. ---> <cfswitch expression="#intTDIndex#"> <cfcase value="1"> <cfset strColumn = "ball_number" /> </cfcase> <cfcase value="2"> <cfset strColumn = "white_ball_fr" /> </cfcase> <cfcase value="3"> <cfset strColumn = "powerball_fr" /> </cfcase> <cfcase value="4"> <cfset strColumn = "power_play_fr" /> </cfcase> </cfswitch> <!--- Set the column value. Be sure to cast to a Java String, just to make sure we don't corrupt our ColdFusion query. ---> <cfset qPowerBall[ strColumn ][ qPowerBall.RecordCount ] = JavaCast( "string", Trim( arrTD[ intTDIndex ].XmlValue ) ) /> </cfloop> </cfloop> <!--- Dump out the resultant query. ---> <cfdump var="#qPowerBall#" label="PowerBall Statistics" /> </cfif>