Ask Ben: Screen Scraping PowerBall Statistics With ColdFusion

<!---
	Determine the PowerBall URL that we want to scrape.
	NOTE: I am building this rather than just putting it
	in the CFHttp tag for text-wrapping reasons.
--->
<cfset strURL = (
	"http://www.powerball.com/powerball/pb_frequency.asp?" &
	"matrixID=2&graph=0"
	) />
 
 
<!--- Grab the PowerBall HTML page. --->
<cfhttp
	url="#strURL#"
	method="get"
	useragent="#CGI.http_user_agent#"
	result="objGet"
	/>
 
 
<!---
	We want to grab all the tables out of the page.
	One of them should be the one that we are looking for.
	Realize, however, that we are doing this based on what
	the HTML of the target page looked like at the time of
	this writing. One of the dangerous parts of screen
	scraping is that the target page may change at any time
	without any warning, rendering this algorithm totally
	useless.
--->
<cfset objPattern = CreateObject(
	"java",
	"java.util.regex.Pattern"
	).Compile(
		JavaCast(
			"string",
			"(?i)<table[^>]*>([\w\W](?!<table))+?</table>"
			)
		) />
 
 
<!---
	Create the pattern matcher based on our regular
	expression pattern and the FileContent that we got
	from our CFHttp call.
--->
<cfset objMatcher = objPattern.Matcher(
	JavaCast( "string", objGet.FileContent )
	) />
 
 
<!---
	Set default table code. This variable will hopefully
	contain our target HTML table in a little bit.
--->
<cfset strTableHtml = "" />
 
 
<!--- Find the proper table. --->
<cfloop condition="objMatcher.Find()">
 
	<!---
		Check to see if we have the right table. This
		one will contain the header values taht we are
		searching for.
 
		NOTE: Just a reminder that this may change at any
		point and only holds true at the time that this
		blog entry was written.
	--->
	<cfif (
		Find( "Ball Number", objMatcher.Group() ) AND
		Find( "White Balls", objMatcher.Group() ) AND
		Find( "Powerball", objMatcher.Group() ) AND
		Find( "Power Play", objMatcher.Group() )
		)>
 
		<!--- Extract table code. --->
		<cfset strTableHtml = objMatcher.Group() />
 
	</cfif>
 
</cfloop>
 
 
<!--- Check to see if we have found the HTML table. --->
<cfif Len( strTableHtml )>
 
	<!---
		Now that we have the table HTML, we want to
		strip out most of the data so that we can treat
		it as XHTML. To do this, we are going to strip out
		everything but the table related tags.
	--->
	<cfset strTableHtml = strTableHtml.ReplaceAll(
		JavaCast( "string", "(?i)<(?!/?(ta|tr|th|td))[^>]*>" ),
		JavaCast( "string", " " )
		) />
 
 
	<!---
		Now, strip out all tag attributes. This just helps
		to make sure that the resultant HTML is more likely
		to stick to an XML standard.
	--->
	<cfset strTableHtml = strTableHtml.ReplaceAll(
		JavaCast( "string", "(?i)<(/?\w+)[^>]*>" ),
		JavaCast( "string", "<$1>" )
		) />
 
 
	<!---
		ASSERT: At this point, the XHTML that we have
		remaining should function as a basic XML document,
		which we can query off of.
	--->
 
 
	<!---
		Now, let's build a ColdFusion query that will hold
		the values of the HTML.
	--->
	<cfset qPowerBall = QueryNew(
		"ball_number, white_ball_fr, powerball_fr, power_play_fr",
		"varchar, varchar, varchar, varchar"
		) />
 
 
	<!---
		Get all TR elements that are in the 3rd or greater
		position (we have no need to grab the headers or
		the buffer row that exists between the header and
		the body rows).
	--->
	<cfset arrTR = XmlSearch(
		Trim( strTableHtml ),
		"//tr[ position() > 2 ]"
		) />
 
 
	<!--- Loop over the TR nodes. --->
	<cfloop
		index="intTRIndex"
		from="1"
		to="#ArrayLen( arrTR )#"
		step="1">
 
		<!---
			Get a short hand reference for the current
			TR node.
		--->
		<cfset xmlTR = arrTR[ intTRIndex ] />
 
 
		<!---
			For each row, we are going to want to add a record
			the PowerBall query we are building.
		--->
		<cfset QueryAddRow( qPowerBall ) />
 
 
		<!---
			For each row, we might have anywhere from zero to
			four columns. We won't know until we query for the
			TD text. When querying for text, we want to make
			sure to strip out the white space which we may have
			created when cleaning the HTML. Normally, we would
			use normalize-space() to trim the text() value, but
			CFMX7 does not support this yet.
		--->
		<cfset arrTD = XmlSearch(
			xmlTR,
			"td/text()"
			) />
 
 
		<!--- Loop over the TD values. --->
		<cfloop
			index="intTDIndex"
			from="1"
			to="#ArrayLen( arrTD )#"
			step="1">
 
			<!---
				The column to which we add this value will
				depend on the index of the current TD.
			--->
			<cfswitch expression="#intTDIndex#">
				<cfcase value="1">
					<cfset strColumn = "ball_number" />
				</cfcase>
				<cfcase value="2">
					<cfset strColumn = "white_ball_fr" />
				</cfcase>
				<cfcase value="3">
					<cfset strColumn = "powerball_fr" />
				</cfcase>
				<cfcase value="4">
					<cfset strColumn = "power_play_fr" />
				</cfcase>
			</cfswitch>
 
 
			<!---
				Set the column value. Be sure to cast to a Java
				String, just to make sure we don't corrupt our
				ColdFusion query.
			--->
			<cfset qPowerBall[ strColumn ][ qPowerBall.RecordCount ] = JavaCast( "string", Trim( arrTD[ intTDIndex ].XmlValue ) ) />
 
		</cfloop>
 
	</cfloop>
 
 
	<!--- Dump out the resultant query. --->
	<cfdump
		var="#qPowerBall#"
		label="PowerBall Statistics"
		/>
 
</cfif>

For Cut-and-Paste