Apr 02, 2018 · 0 Responses

Content Grabber – Useful Custom Scripts

Content Grabber has powerful custom scripting that lets you customize its behavior and develop powerful web scraping agents that can crawl and scrape data from simple to very complex websites.

Below are a few examples of custom scripts written in C# that show how to connect to a database and how to modify an XPath dynamically at run time while the scraper is running.

Initialization Script

This script initializes the global variable counter to 1 using args.GlobalData.

using System;

using Sequentum.ContentGrabber.Api;

public class Script
{
    // Name of the global-data entry written at agent start-up.
    private const string CounterKey = "counter";

    // Value stored under CounterKey when the agent is initialized.
    private const int InitialCounterValue = 1;

    /// <summary>
    /// Stores the starting counter value in the agent's global data.
    /// </summary>
    /// <param name="args">Initialization arguments; only <c>GlobalData</c> is used.</param>
    /// <returns>Always <c>true</c>.</returns>
    public static bool InitializeAgent(AgentInitializationArguments args)
    {
        args.GlobalData.AddOrUpdateData(CounterKey, InitialCounterValue);
        return true;
    }
}

 

Database Connection and Insert Script Example

This script can be used to insert data into a MySQL database.

using System;
using System.IO;
using System.Data.SqlClient;
using Sequentum.ContentGrabber.Api;
using Sequentum.ContentGrabber.Commands;
using MySql.Data.MySqlClient;
using System.Linq;

public class Script
{
	/// <summary>
	/// Writes every row of the export table into the MySQL table `godaddy`.
	/// </summary>
	/// <param name="args">Export arguments; supplies the data table and the debug log.</param>
	/// <returns>
	/// <c>true</c> when the whole table was processed; <c>false</c> when an exception
	/// was caught (it is written to the debug log as an error).
	/// </returns>
	public static bool ExportData(DataExportArguments args)
	{
		try
		{
			string connectionText = BuildConnectionString();

			// Generated once per call: the same GUID is bound to every row of this export.
			Guid exportId = Guid.NewGuid();

			using (var connection = new MySqlConnection(connectionText))
			{
				connection.Open();

				using (IExportReader rows = args.Data.GetTable())
				{
					while (rows.Read())
					{
						InsertRow(connection, rows, exportId);
					}
				}
			}

			return true;
		}
		catch (Exception failure)
		{
			args.WriteDebug(failure.ToString(), DebugMessageType.Error);
			return false;
		}
	}

	// Builds the connection string for the local "content grabber" database.
	// NOTE(review): root with an empty password looks like sample configuration - confirm before real use.
	private static string BuildConnectionString()
	{
		var settings = new MySqlConnectionStringBuilder
		{
			Server = "localhost",
			UserID = "root",
			Password = "",
			Database = "content grabber"
		};

		return settings.ConnectionString;
	}

	// Runs one parameterized INSERT for the reader's current row. Per the SQL text,
	// a key collision overwrites the existing row's `Godaddy ID` with exportId.
	private static void InsertRow(MySqlConnection connection, IExportReader row, Guid exportId)
	{
		using (var insert = new MySqlCommand())
		{
			insert.Connection = connection;
			insert.CommandText = @"INSERT INTO `godaddy` (
							`Godaddy ID`,`GetTime`,`domain`,`EstimatedValue` )VALUES (@idGuid,@GetTime,@domain,@EstimatedValue)
ON DUPLICATE KEY UPDATE `Godaddy ID` = @idGuid";

			MySqlParameterCollection values = insert.Parameters;
			values.AddWithValue("@idGuid", exportId);
			values.AddWithValue("@GetTime", row.GetStringValue("GetTime"));
			values.AddWithValue("@domain", row.GetStringValue("domain"));
			values.AddWithValue("@EstimatedValue", row.GetStringValue("EstimatedValue"));

			insert.ExecuteNonQuery();
		}
	}
}

 

Modify Xpath at Runtime

This can be used to dynamically change the XPath of Click Action, Link Navigation and other commands that need an XPath to interact with the browser.

using System;
using Sequentum.ContentGrabber.Api;
using Sequentum.ContentGrabber.Commands;
public class Script
{
	/// <summary>
	/// Points the command's first selection path at the "javascript:" link whose
	/// text is the current row's ID plus one.
	/// </summary>
	/// <param name="args">
	/// Script arguments (see the product help for their definition). <c>Command</c> is
	/// the command being transformed; <c>DataRow</c> supplies the "ID" value.
	/// </param>
	/// <returns><c>true</c> once the XPath has been replaced.</returns>
	/// <exception cref="InvalidOperationException">
	/// The command does not implement <c>ISelection</c>, so there is no path to modify.
	/// </exception>
	public static bool TransformCommand(CommandTransfomationScriptArguments args)
	{
		ISelection selection = args.Command as ISelection;
		if (selection == null)
		{
			// Fail with a clear message instead of a NullReferenceException on the next line.
			throw new InvalidOperationException("TransformCommand requires a command that implements ISelection.");
		}

		// Do the addition before building the string: inside a string concatenation
		// "+ 1" would append the character '1' (ID 5 would give "51" rather than "6").
		int nextId = int.Parse(args.DataRow.GetDataValue("ID")) + 1;

		selection.Selection.SelectionPaths[0].Xpath = "//a[contains(@href,'javascript:')][text()='" + nextId + "']";

		return true;
	}
}

 

Downloading Full HTML page on Disk

Using this script, the full HTML page can be downloaded to the local disk so that it can be parsed again if something was missed while scraping, instead of requesting the website again and consuming its bandwidth. This is an ideal workflow practice for web scraping: it avoids problems when you realize at the end that something was missed during crawling.

using System;
using System.IO;
using Sequentum.ContentGrabber.Api;
public class Script
{
 // Folder that receives the saved pages; change it to suit the site being scraped.
 private const string OutputDirectory = "E:\\html\\site_name\\";

 /// <summary>
 /// Saves the incoming content to disk as "&lt;RowId&gt;.html" and returns that file name.
 /// </summary>
 /// <param name="args">
 /// Transformation arguments (see the product help for their definition).
 /// <c>Content</c> is the text to save; <c>DataRow.RowId</c> names the file.
 /// </param>
 /// <returns>The file name (row id plus ".html"), without the folder part.</returns>
 public static string TransformContent(ContentTransformationArguments args)
 {
  String fname = args.DataRow.RowId.ToString();
  DownloadHTML(args.Content, fname);
  return fname + ".html";
 }

 /// <summary>
 /// Writes <paramref name="html"/> as UTF-8 to "&lt;file_name&gt;.html" in the output folder.
 /// </summary>
 /// <param name="html">Text to write to the file.</param>
 /// <param name="file_name">File name without the ".html" extension.</param>
 public static void DownloadHTML(String html, String file_name)
 {
  // WriteAllText does not create missing folders; create the target folder first
  // (a no-op when it already exists) so the first run does not fail.
  System.IO.Directory.CreateDirectory(OutputDirectory);

  System.IO.File.WriteAllText(OutputDirectory + file_name + ".html", html, System.Text.Encoding.UTF8);
 }
}

 

Leave a Reply

Your email address will not be published. Please enter your name, email and a comment.

You may use these HTML tags and attributes: <a href="" title=""> <abbr title=""> <acronym title=""> <b> <blockquote cite=""> <cite> <code> <del datetime=""> <em> <i> <q cite=""> <s> <strike> <strong>