C#: Parse a website and save specific content as XML.

In this tutorial I will show you how to iterate through a website using a WebClient, save each page’s content into a string, parse it using regular expressions, and save the result as XML.

I will be parsing airplane data from airliners.net

Here is a picture of what the website looks like.

Get started with a .NET C# Console application and create a WebClient.

Download the page at the URL and output the HTML that is returned.

// Download the raw HTML of one aircraft page and dump it to the console.
// WebClient is IDisposable, so it is scoped to a using block and released
// as soon as the download has finished.
string htmlString;
using (WebClient wc = new WebClient())
{
    htmlString = wc.DownloadString("http://www.airliners.net/aircraft-data/stats.main?id=2");
}
Console.WriteLine(htmlString);

Let's take a look at both the console output and the source of the page to make sure we got the right HTML back.

Console and Page Source

Success, we found the airplane title! Now let's see if we can parse it, and only it, from the entire string of HTML.

Try to see if there is a pattern that’s unique to what you need.

// Fetch the page. The WebClient is disposed as soon as the HTML is in memory.
string htmlString;
using (WebClient wc = new WebClient())
{
    htmlString = wc.DownloadString("http://www.airliners.net/aircraft-data/stats.main?id=2");
}

// The title sits inside <center><h1>...</h1>. The lazy (.*?) stops at the
// first closing </h1>, so group 1 holds only the title text.
Match mTitle = Regex.Match(htmlString, @"<center><h1>(.*?)</h1>");
if (mTitle.Success)
{
    string airplaneTitle = mTitle.Groups[1].Value;
    Console.WriteLine(airplaneTitle);
}

Success.

Now let's get the Country of Origin and the other categories.

// Capture everything between the "Country of origin" label and the next <p>.
// Singleline lets '.' cross line breaks, because the value sits on a later line.
Match mCountry = Regex.Match(htmlString, @"<b>Country of origin</font>(.*?)<p>", RegexOptions.Singleline);
string airplaneCountry = mCountry.Success ? mCountry.Groups[1].Value : "";
if (mCountry.Success)
{
    // Raw capture: still carries leftover markup in front of the value.
    Console.WriteLine(airplaneCountry);
}
Console.WriteLine("***************************************************************");

// Clean up the capture: delete every run of characters that ends in a
// greater-than sign (i.e. the tag residue), then trim the whitespace.
string withoutTags = Regex.Replace(airplaneCountry, "(.*?)>", "");
airplaneCountry = withoutTags.Trim();
Console.WriteLine(airplaneCountry);

Now I am tired of typing, so let's make a method for everything else.

/// <summary>
/// Downloads a single aircraft page and prints each data category to the console.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    string htmlString;
    // WebClient is IDisposable; release it as soon as the page is in memory.
    using (WebClient wc = new WebClient())
    {
        htmlString = wc.DownloadString("http://www.airliners.net/aircraft-data/stats.main?id=2");
    }

    Console.WriteLine("Name       :" + parsePattern(@"<center><h1>(.*?)</h1>", htmlString));
    Console.WriteLine("Country    :" + parsePattern(@"<b>Country of origin</font>(.*?)<p>", htmlString));
    Console.WriteLine("Powerplants:" + parsePattern(@"<b>Powerplants</font>(.*?)<p>", htmlString));
    Console.WriteLine("Performance:" + parsePattern(@"<b>Performance</font>(.*?)<p>", htmlString));
    Console.WriteLine("Weights    :" + parsePattern(@"<b>Weights</font>(.*?)<p>", htmlString));
    Console.WriteLine("Dimensions :" + parsePattern(@"<b>Dimensions</font>(.*?)<p>", htmlString));
    Console.WriteLine("Capacity   :" + parsePattern(@"<b>Capacity</font>(.*?)<p>", htmlString));
    Console.WriteLine("Type       :" + parsePattern(@"<b>Type</font>(.*?)<p>", htmlString));
    Console.WriteLine("Production :" + parsePattern(@"<b>Production</font>(.*?)<p>", htmlString));

    // The History section contains several <p> tags, so the "label ... <p>"
    // pattern used above would cut it short. It is therefore matched up to the
    // table that follows it, and all remaining tags are stripped afterwards.
    Match mHistory = Regex.Match(htmlString, @"<b>History</font>(.*?)<table border=0 cellpadding=1 cellspacing=0 >", RegexOptions.Singleline);
    if (mHistory.Success)
    {
        // <[^>]*> matches any single HTML tag.
        string strContent = Regex.Replace(mHistory.Groups[1].Value, @"<[^>]*>", "").Trim();
        Console.WriteLine("History : " + strContent);
    }
}
/// <summary>
/// Extracts capture group 1 of <paramref name="pat"/> from <paramref name="htmlString"/>
/// and removes leading tag residue from it.
/// </summary>
/// <param name="pat">Regular expression with at least one capture group; matched in Singleline mode.</param>
/// <param name="htmlString">The HTML text to search.</param>
/// <returns>
/// An empty string when the pattern does not match. Otherwise the captured text:
/// returned as-is (not trimmed) when it contains no greater-than sign, or with
/// every run of characters ending in a greater-than sign removed and the result trimmed.
/// </returns>
public static string parsePattern(string pat, string htmlString)
{
    Match hit = Regex.Match(htmlString, pat, RegexOptions.Singleline);
    if (!hit.Success)
    {
        return "";
    }

    string captured = hit.Groups[1].Value;

    // No '>' means no tag residue: hand the capture back untouched.
    if (captured.IndexOf('>') < 0)
    {
        return captured;
    }

    // Strip everything up to and including each '>' (the replace is not
    // Singleline, so it works line by line), then trim the remainder.
    string cleaned = Regex.Replace(captured, "(.*?)>", "");
    return cleaned.Trim();
}

Now let's save everything to XML :D
I created a folder on my C: drive called C:\Airplanes\
The program iterates through each page and saves its content there.

Here’s the full code to process the whole website:

/// <summary>
/// Walks the aircraft pages one id at a time, starting at 1, and appends each
/// page's data to the XML file via ReadAndAppend.
/// </summary>
/// <remarks>
/// The loop ends when a download fails with a WebException (for example when
/// the server answers with an error status for an id).
/// </remarks>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    Random random = new Random();
    int pageCount = 1;

    // One WebClient for the whole crawl, disposed when the crawl ends.
    using (WebClient wc = new WebClient())
    {
        while (true)
        {
            // Go easy on the web server: pause between one second and just
            // under three seconds (the upper bound of Next is exclusive).
            int randomWait = random.Next(1000, 3000);
            Thread.Sleep(randomWait);

            string htmlString;
            try
            {
                htmlString = wc.DownloadString("http://www.airliners.net/aircraft-data/stats.main?id=" + pageCount);
            }
            catch (WebException ex)
            {
                // Stop cleanly instead of crashing with an unhandled exception.
                Console.WriteLine("Stopping at page " + pageCount + ": " + ex.Message);
                break;
            }

            ReadAndAppend(htmlString);

            // Report the page that was just saved, then move on to the next id.
            Console.WriteLine("Saving :" + pageCount);
            pageCount++;
        }
    }
}
/// <summary>
/// Parses one aircraft page, builds a plane XML fragment from it, echoes the
/// fragment to the console and appends it to C:\Airplanes\airData.xml.
/// </summary>
/// <remarks>
/// The target folder must already exist. Each call appends one plane element;
/// no enclosing root element is written.
/// </remarks>
/// <param name="htmlString">Full HTML of one aircraft page.</param>
public static void ReadAndAppend(string htmlString)
{
    string name = parsePattern(@"<center><h1>(.*?)</h1>", htmlString);
    string country = parsePattern(@"<b>Country of origin</font>(.*?)<p>", htmlString);
    string power = parsePattern(@"<b>Powerplants</font>(.*?)<p>", htmlString);
    string perf = parsePattern(@"<b>Performance</font>(.*?)<p>", htmlString);
    string lb = parsePattern(@"<b>Weights</font>(.*?)<p>", htmlString);
    string dim = parsePattern(@"<b>Dimensions</font>(.*?)<p>", htmlString);
    string cap = parsePattern(@"<b>Capacity</font>(.*?)<p>", htmlString);
    string type = parsePattern(@"<b>Type</font>(.*?)<p>", htmlString);
    string prod = parsePattern(@"<b>Production</font>(.*?)<p>", htmlString);

    // History spans several paragraphs, so it is matched up to the table that
    // follows it and then stripped of every remaining HTML tag.
    string hist = "";
    Match mHistory = Regex.Match(htmlString, @"<b>History</font>(.*?)<table border=0 cellpadding=1 cellspacing=0 >", RegexOptions.Singleline);
    if (mHistory.Success)
    {
        hist = Regex.Replace(mHistory.Groups[1].Value, @"<[^>]*>", "").Trim();
    }

    // Build the XML fragment; every value is escaped so that characters such
    // as '&' or '<' in the scraped text cannot break the markup.
    string xmlString = "<plane>";
    xmlString += BuildXmlElement("name", name);
    xmlString += BuildXmlElement("country", country);
    xmlString += BuildXmlElement("power", power);
    xmlString += BuildXmlElement("perf", perf);
    xmlString += BuildXmlElement("lb", lb);
    xmlString += BuildXmlElement("dim", dim);
    xmlString += BuildXmlElement("cap", cap);
    xmlString += BuildXmlElement("type", type);
    xmlString += BuildXmlElement("prod", prod);
    xmlString += BuildXmlElement("hist", hist);
    xmlString += "</plane>\r\n\r\n";

    // Show what is being saved.
    Console.WriteLine(xmlString);

    // Append to C:\Airplanes\airData.xml; the using block closes the file
    // even when the write throws.
    using (StreamWriter streamWrite = File.AppendText("C:\\Airplanes\\airData.xml"))
    {
        streamWrite.WriteLine(xmlString);
    }
}

/// <summary>
/// Formats one indented child element line, XML-escaping the value.
/// </summary>
/// <param name="tag">Element name.</param>
/// <param name="value">Text scraped from HTML; may contain HTML entities.</param>
/// <returns>The element on its own line, terminated by CR LF.</returns>
private static string BuildXmlElement(string tag, string value)
{
    // Decode HTML entities first (so "&amp;" is not escaped a second time),
    // then escape the five characters that are special in XML.
    string text = System.Net.WebUtility.HtmlDecode(value);
    return "  <" + tag + ">" + System.Security.SecurityElement.Escape(text) + "</" + tag + ">\r\n";
}
 
/// <summary>
/// Extracts capture group 1 of <paramref name="pat"/> from <paramref name="htmlString"/>
/// and removes leading tag residue from it.
/// </summary>
/// <param name="pat">Regular expression with at least one capture group; matched in Singleline mode.</param>
/// <param name="htmlString">The HTML text to search.</param>
/// <returns>
/// An empty string when the pattern does not match. Otherwise the captured text:
/// returned as-is (not trimmed) when it contains no greater-than sign, or with
/// every run of characters ending in a greater-than sign removed and the result trimmed.
/// </returns>
public static string parsePattern(string pat, string htmlString)
{
    Match hit = Regex.Match(htmlString, pat, RegexOptions.Singleline);
    if (!hit.Success)
    {
        return "";
    }

    string captured = hit.Groups[1].Value;

    // No '>' means no tag residue: hand the capture back untouched.
    if (captured.IndexOf('>') < 0)
    {
        return captured;
    }

    // Strip everything up to and including each '>' (the replace is not
    // Singleline, so it works line by line), then trim the remainder.
    string cleaned = Regex.Replace(captured, "(.*?)>", "");
    return cleaned.Trim();
}

Here's the XML file that was created.

C#: How to get correlation coefficient of two arrays Correl()

This simple example shows you how to get the correlation coefficient of two arrays. Microsoft Excel and OpenOffice Calc both have a function for this called CORREL():

// Pearson correlation coefficient of two equally long samples, computed as
//   r = (n*Sxy - Sx*Sy) / sqrt((n*Sxx - Sx^2) * (n*Syy - Sy^2))
// where Sx, Sy, Sxy, Sxx and Syy are the sums of x, y, x*y, x^2 and y^2.
double[] array1 = { 3, 2, 4, 5, 6 };
double[] array2 = { 9, 7, 12, 15, 17 };

// The formula pairs the samples element by element.
if (array1.Length != array2.Length)
{
    throw new ArgumentException("array1 and array2 must have the same length.");
}

int count = array1.Length;
double sum_x = 0;
double sum_y = 0;
double sum_xy = 0;
double sum_xpow2 = 0;
double sum_ypow2 = 0;

// All five running sums are gathered in a single pass over the data.
for (int i = 0; i < count; i++)
{
    double x = array1[i];
    double y = array2[i];
    sum_x += x;
    sum_y += y;
    sum_xy += x * y;
    sum_xpow2 += x * x;
    sum_ypow2 += y * y;
}

// If either sample has zero variance (or the arrays are empty) the
// denominator is 0 and the result is NaN.
double Correl =
    (count * sum_xy - sum_x * sum_y) /
    Math.Sqrt((count * sum_xpow2 - sum_x * sum_x) * (count * sum_ypow2 - sum_y * sum_y));

Console.WriteLine("CORREL : " + Correl);

C#: Project Euler Solution to Problem 27

http://projecteuler.net/problem=27

Considering quadratics of the form:

n² + an + b, where |a| < 1000 and |b| < 1000, and where |n| is the modulus/absolute value of n (e.g. |11| = 11 and |-4| = 4). Find the product of the coefficients, a and b, for the quadratic expression that produces the maximum number of primes for consecutive values of n, starting with n = 0.

/// <summary>
/// Project Euler 27: finds the coefficients a and b (|a| below 1000, |b| below
/// 1000) for which n² + a·n + b yields the longest run of primes for
/// n = 0, 1, 2, ... and prints the product a·b.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    int maxprimes = 0;
    int maxproduct = 0;

    // Cover both signs of a, as the problem states.
    for (int a = -999; a <= 999; a++)
    {
        // At n = 0 the expression equals b, so b itself must be prime:
        // values below 2 can never start a run and are skipped.
        for (int b = 2; b <= 999; b++)
        {
            // countPrime evaluates n² - x·n + b for its first argument x,
            // so -a is passed to obtain n² + a·n + b.
            int prime = countPrime(-a, b);
            if (maxprimes < prime)
            {
                maxprimes = prime;
                maxproduct = a * b;
            }
        }
    }
    Console.WriteLine(maxproduct);
}
 
/// <summary>
/// Counts how many consecutive values of n, starting at n = 0, make
/// n² - a·n + b a prime number.
/// </summary>
/// <remarks>
/// Note the sign: the linear term is subtracted, so a caller that wants
/// n² + k·n + b must pass a = -k.
/// </remarks>
/// <param name="a">Coefficient of the subtracted linear term.</param>
/// <param name="b">Constant term.</param>
/// <returns>Length of the initial run of primes; 0 when the value at n = 0 is not prime.</returns>
public static int countPrime(int a, int b)
{
    int count = 0;
    for (int n = 0; ; n++)
    {
        // Exact integer arithmetic; long keeps the intermediate products from overflowing.
        long value = (long)n * n - (long)a * n + b;

        // Anything below 2 is not prime. The run also ends if the value
        // no longer fits the int that isPrime accepts.
        if (value < 2 || value > int.MaxValue || !isPrime((int)value))
        {
            return count;
        }
        count++;
    }
}
 
/// <summary>
/// Tests whether <paramref name="n"/> is a prime number.
/// </summary>
/// <param name="n">The number to test; zero and negative values are never prime.</param>
/// <returns>true when n is prime, otherwise false.</returns>
public static bool isPrime(int n)
{
    // 0, 1 and all negative numbers are not prime.
    if (n < 2)
    {
        return false;
    }
    // 2 and 3 are prime.
    if (n < 4)
    {
        return true;
    }
    if (n % 2 == 0)
    {
        return false;
    }
    // A composite number has a divisor no larger than its square root, so
    // only odd candidates up to sqrt(n) are tried. The long product keeps
    // i * i from overflowing near int.MaxValue.
    for (int i = 3; (long)i * i <= n; i += 2)
    {
        if (n % i == 0)
        {
            return false;
        }
    }
    return true;
}

C#: Project Euler Solution to Problem 32

http://projecteuler.net/problem=32

We shall say that an n-digit number is pandigital if it makes use of all the digits 1 to n exactly once; for example, the 5-digit number, 15234, is 1 through 5 pandigital.

The product 7254 is unusual, as the identity, 39 x 186 = 7254, containing multiplicand, multiplier, and product is 1 through 9 pandigital.

Find the sum of all products whose multiplicand/multiplier/product identity can be written as a 1 through 9 pandigital.

// Public static list of strings, created empty. Because it is static and
// mutable, any code that modifies it is not thread-safe.
public static LinkedList<string> onetonine = new LinkedList<string>();
/// <summary>
/// Project Euler 32: sums every distinct product whose
/// multiplicand/multiplier/product identity uses the digits 1 to 9 exactly once.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // A set stores each product once, however many factor pairs produce it.
    HashSet<int> distinct_product = new HashSet<int>();

    // Nine digits in total force a 4-digit product: factors with x and y
    // digits give a product of x+y-1 or x+y digits, and only x+y = 5 with a
    // 4-digit product adds up to 9. The smaller factor therefore has at most
    // two digits. Multiplication is symmetric, so only a < b is tried.
    for (int a = 1; a < 100; a++)
    {
        for (int b = a + 1; a * b < 10000; b++)
        {
            int product = a * b;
            if (isDigits(a + "" + b + "" + product))
            {
                distinct_product.Add(product);
            }
        }
    }

    int sum = 0;
    foreach (int prod in distinct_product)
    {
        sum += prod;
    }
    Console.WriteLine("sum : " + sum);
}
 
/// <summary>
/// Tests whether <paramref name="s"/> is 1-to-9 pandigital: exactly nine
/// characters long and containing each of the digits 1 to 9 exactly once.
/// </summary>
/// <param name="s">The text to test; null is treated as not pandigital.</param>
/// <returns>true when s is 1-to-9 pandigital, otherwise false.</returns>
public static bool isDigits(string s)
{
    if (s == null || s.Length != 9)
    {
        return false;
    }

    // Bit k of 'seen' records that the digit (k + 1) has already occurred.
    // A local bitmask keeps the check free of shared state and allocations.
    int seen = 0;
    foreach (char c in s)
    {
        if (c < '1' || c > '9')
        {
            return false;
        }
        int bit = 1 << (c - '1');
        if ((seen & bit) != 0)
        {
            return false; // digit repeated
        }
        seen |= bit;
    }

    // Nine distinct digits drawn from 1..9 means every digit is present.
    return true;
}

C#: Project Euler Solution to Problem 39

http://projecteuler.net/problem=39

If p is the perimeter of a right angle triangle with integral length sides, {a,b,c}, there are exactly three solutions for p = 120.

{20,48,52}, {24,45,51}, {30,40,50}

For which value of p <= 1000, is the number of solutions maximised?

// Project Euler 39: for which perimeter p <= 1000 does the equation
// a + b + c = p, a² + b² = c² have the most integer solutions?
int Max_P = 0;       // largest number of solutions found so far
int P_position = 0;  // perimeter that produced Max_P
for (int p = 1; p <= 1000; p++)
{
    int count_p = 0;

    // Sides are enumerated as a < b < c, so every triangle is counted once.
    // The smallest side satisfies a + (a+1) + (a+2) <= p, i.e. a < p / 3.
    for (int a = 1; a < p / 3; a++)
    {
        for (int b = a + 1; ; b++)
        {
            // The perimeter fixes the third side; no loop over c is needed.
            int c = p - a - b;
            if (c <= b)
            {
                break; // c must be the longest side
            }
            if (a * a + b * b == c * c)
            {
                count_p++;
            }
        }
    }

    // Strict comparison keeps the smallest perimeter in case of a tie.
    if (Max_P < count_p)
    {
        Max_P = count_p;
        P_position = p;
    }
}
Console.WriteLine("p = " + P_position);

C#: Project Euler Solution to Problem 47

http://projecteuler.net/problem=47

The first two consecutive numbers to have two distinct prime factors are:

14 = 2 x 7
15 = 3 x 5

Find the first four consecutive integers to have four distinct prime factors each. What is the first of these numbers?

/// <summary>
/// Project Euler 47: finds the first four consecutive integers that each
/// have exactly four distinct prime factors and prints them, largest first.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    const int Target = 4;

    // Length of the current streak of consecutive qualifying numbers.
    int run = 0;
    for (int n = 2; ; n++)
    {
        run = (CountDistinctPrimeFactors(n) == Target) ? run + 1 : 0;

        if (run == Target)
        {
            // n is the last of the four; n - 3 is the answer to the problem.
            Console.WriteLine(n);
            Console.WriteLine(n - 1);
            Console.WriteLine(n - 2);
            Console.WriteLine(n - 3);
            break;
        }
    }
}

/// <summary>
/// Counts the distinct prime factors of <paramref name="n"/> by trial division.
/// </summary>
/// <param name="n">A positive integer.</param>
/// <returns>The number of different primes that divide n (0 for n = 1).</returns>
private static int CountDistinctPrimeFactors(int n)
{
    int distinct = 0;
    for (int p = 2; (long)p * p <= n; p++)
    {
        if (n % p != 0)
        {
            continue;
        }
        distinct++;
        // Divide the factor out completely, so every p that divides the
        // remainder later on is a new prime.
        while (n % p == 0)
        {
            n /= p;
        }
    }
    // Whatever is left above 1 is one last prime factor.
    if (n > 1)
    {
        distinct++;
    }
    return distinct;
}
/// <summary>
/// Tests whether <paramref name="n"/> is a prime number.
/// </summary>
/// <param name="n">The number to test; zero and negative values are never prime.</param>
/// <returns>true when n is prime, otherwise false.</returns>
public static bool isPrime(int n)
{
    // 0, 1 and all negative numbers are not prime.
    if (n < 2)
    {
        return false;
    }
    // 2 and 3 are prime.
    if (n < 4)
    {
        return true;
    }
    if (n % 2 == 0)
    {
        return false;
    }
    // A composite number has a divisor no larger than its square root, so
    // only odd candidates up to sqrt(n) are tried. The long product keeps
    // i * i from overflowing near int.MaxValue.
    for (int i = 3; (long)i * i <= n; i += 2)
    {
        if (n % i == 0)
        {
            return false;
        }
    }
    return true;
}

C#: Project Euler Solutions to Problems 9, 13, and 38

For solving ten consecutive problems (1-10).

http://projecteuler.net/problem=9

a² + b² = c²

There exists exactly one Pythagorean triplet for which a + b + c = 1000.
Find the product abc.

// Project Euler 9: find the Pythagorean triplet with a + b + c = 1000 and
// print a, b, c and the product a*b*c.
// Sides are searched as a < b < c using integers only, so the triplet is
// reported exactly once and no floating-point comparison is involved.
long product = 0; // stays 0 until the triplet has been found
for (int a = 1; a < 1000 / 3 && product == 0; a++)
{
    for (int b = a + 1; ; b++)
    {
        // The required sum fixes the third side.
        int c = 1000 - a - b;
        if (c <= b)
        {
            break; // c must be the longest side
        }
        if (a * a + b * b == c * c)
        {
            product = (long)a * b * c;
            Console.WriteLine("a " + a + " b " + b + " c " + c + " product : " + product);
            break;
        }
    }
}

http://projecteuler.net/problem=13

Work out the first ten digits of the sum of the following one-hundred 50-digit numbers.

// Project Euler 13: print the first ten digits of the sum of the numbers in
// totalstring (presumably the one hundred 50-digit numbers, one per line;
// it is defined outside this snippet).
string[] linerows = totalstring.Split('\n');

// BigInteger adds the full numbers exactly. Summing only a fixed-length
// prefix of each row can lose a carry from the discarded digits.
System.Numerics.BigInteger sum = System.Numerics.BigInteger.Zero;
foreach (string row in linerows)
{
    // Trim removes a trailing '\r'; blank lines (for example after a final
    // newline) are skipped.
    string digits = row.Trim();
    if (digits.Length == 0)
    {
        continue;
    }
    sum += System.Numerics.BigInteger.Parse(digits);
}

string sumText = sum.ToString();
// Guard against sums with fewer than ten digits.
Console.WriteLine(sumText.Length > 10 ? sumText.Substring(0, 10) : sumText);

http://projecteuler.net/problem=38

What is the largest 1 to 9 pandigital 9-digit number that can be formed as the concatenated product of an integer with (1, 2, … , n) where n > 1?

/// <summary>
/// Project Euler 38: finds the largest 1-to-9 pandigital number that is the
/// concatenated product of an integer with (1, 2, ..., n), n greater than 1.
/// Prints every new maximum as it is found, then the final result.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    int max = 0;

    // n = 9 is the last useful multiplier count: already for the integer 1
    // the products 1..9 fill all nine digits, and more multipliers only make
    // the concatenation longer.
    for (int numberofappend = 2; numberofappend <= 9; numberofappend++)
    {
        for (int number = 1; ; number++)
        {
            string catstring = appendmore(number, numberofappend);

            // The concatenation never gets shorter as number grows, so once
            // it exceeds nine digits this multiplier count is exhausted.
            if (catstring.Length > 9)
            {
                break;
            }

            if (catstring.Length == 9 && Order(catstring))
            {
                int catint = int.Parse(catstring);
                if (max < catint)
                {
                    max = catint;
                    Console.WriteLine("max so far : " + max);
                }
            }
        }
    }

    Console.WriteLine("max : " + max);
}
/// <summary>
/// Builds the concatenated product of <paramref name="num"/> with 1, 2, ..., <paramref name="howmany"/>.
/// </summary>
/// <param name="num">The integer to multiply.</param>
/// <param name="howmany">The highest multiplier; values below 1 give an empty string.</param>
/// <returns>The decimal texts of num*1, num*2, ... joined without separators.</returns>
public static string appendmore(int num, int howmany)
{
    string joined = "";
    int multiplier = 1;
    while (multiplier <= howmany)
    {
        int term = num * multiplier;
        joined = joined + term.ToString();
        multiplier++;
    }
    return joined;
}
/// <summary>
/// Tests whether <paramref name="n"/> is pandigital for its own length, i.e.
/// its characters are the digits 1 to n.Length, each used exactly once in any order.
/// </summary>
/// <param name="n">The text to test.</param>
/// <returns>
/// true when n is 1-to-length pandigital; false otherwise, including for
/// null, empty text and text containing anything other than the digits 1 to 9.
/// </returns>
public static bool Order(string n)
{
    if (string.IsNullOrEmpty(n))
    {
        return false;
    }

    // Bit d of 'seen' records that digit d has already occurred.
    int seen = 0;
    foreach (char c in n)
    {
        int digit = c - '0';

        // Every digit must lie in 1..length (and is at most 9 anyway).
        if (digit < 1 || digit > 9 || digit > n.Length)
        {
            return false;
        }
        int bit = 1 << digit;
        if ((seen & bit) != 0)
        {
            return false; // digit repeated
        }
        seen |= bit;
    }

    // 'length' distinct digits, all within 1..length, are exactly 1..length.
    return true;
}

C#: Project Euler Solutions to Problems 12 and 41

I have completed 25 Project Euler Questions without cheating. I was awarded a badge that looks like this.

http://projecteuler.net/problem=12

What is the value of the first triangle number to have over five hundred divisors?

// Project Euler 12: find the first triangle number with more than 500 divisors.

// Counts the divisors of m by pairing each divisor i <= sqrt(m) with m / i.
Func<int, int> countDivisors = m =>
{
    int found = 0;
    for (int i = 1; i * i <= m; i++)
    {
        if (m % i == 0)
        {
            // A perfect-square root pairs with itself and counts once.
            found += (i * i == m) ? 1 : 2;
        }
    }
    return found;
};

int max = 0;
int n = 1;
while (true)
{
    long value = (long)n * (n + 1) / 2;

    // T(n) = n(n+1)/2. After halving whichever of n and n+1 is even, the
    // two factors share no common divisor, so the divisor count of T(n) is
    // the product of their divisor counts.
    int first = (n % 2 == 0) ? n / 2 : n;
    int second = (n % 2 == 0) ? n + 1 : (n + 1) / 2;
    int divisorCount = countDivisors(first) * countDivisors(second);

    if (max < divisorCount)
    {
        max = divisorCount;
        Console.WriteLine(n + " value  " + value + " devisor Count :" + divisorCount);
    }
    if (divisorCount > 500)
    {
        Console.WriteLine(n + " value  " + value + " FINAL  devisor Count :" + divisorCount);
        break;
    }
    n++;
}

http://projecteuler.net/problem=41

We shall say that an n-digit number is pandigital if it makes use of all the digits 1 to n exactly once. For example, 2143 is a 4-digit pandigital and is also prime.

What is the largest n-digit pandigital prime that exists?

/// <summary>
/// Project Euler 41: prints the largest pandigital prime.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // 8- and 9-digit pandigital numbers have digit sums 36 and 45, so they
    // are all divisible by 3. The search can therefore start at the largest
    // 7-digit pandigital number and walk downwards; the first hit is the answer.
    for (int n = 7654321; n >= 1; n--)
    {
        // The cheap digit check runs first, so the primality test is only
        // paid for the few pandigital candidates.
        if (Order(n) && isPrime(n))
        {
            Console.WriteLine(n);
            break;
        }
    }
}
 
/// <summary>
/// Tests whether <paramref name="n"/> is pandigital for its own length, i.e.
/// its decimal digits are 1 to k, each used exactly once, where k is the digit count.
/// </summary>
/// <param name="n">The number to test.</param>
/// <returns>true when n is 1-to-k pandigital; false otherwise, including for zero and negative numbers.</returns>
public static bool Order(int n)
{
    if (n <= 0)
    {
        return false;
    }

    // Count the decimal digits.
    int length = 0;
    for (int rest = n; rest > 0; rest /= 10)
    {
        length++;
    }

    // Bit d of 'seen' records that digit d has already occurred.
    int seen = 0;
    for (int rest = n; rest > 0; rest /= 10)
    {
        int digit = rest % 10;

        // Every digit must lie in 1..length.
        if (digit == 0 || digit > length)
        {
            return false;
        }
        int bit = 1 << digit;
        if ((seen & bit) != 0)
        {
            return false; // digit repeated
        }
        seen |= bit;
    }

    // 'length' distinct digits, all within 1..length, are exactly 1..length.
    return true;
}
 
/// <summary>
/// Tests whether <paramref name="n"/> is a prime number.
/// </summary>
/// <param name="n">The number to test; zero and negative values are never prime.</param>
/// <returns>true when n is prime, otherwise false.</returns>
public static bool isPrime(int n)
{
    // 0, 1 and all negative numbers are not prime.
    if (n < 2)
    {
        return false;
    }
    // 2 and 3 are prime.
    if (n < 4)
    {
        return true;
    }
    if (n % 2 == 0)
    {
        return false;
    }
    // A composite number has a divisor no larger than its square root, so
    // only odd candidates up to sqrt(n) are tried. The long product keeps
    // i * i from overflowing near int.MaxValue.
    for (int i = 3; (long)i * i <= n; i += 2)
    {
        if (n % i == 0)
        {
            return false;
        }
    }
    return true;
}

C#: Project Euler Solution to Problem 42

http://projecteuler.net/problem=42

The nth term of the sequence of triangle numbers is given by t(n) = ½n(n+1); so the first ten triangle numbers are:

1, 3, 6, 10, 15, 21, 28, 36, 45, 55, …

Using words.txt, a 16K text file containing nearly two-thousand common English words, how many are triangle words?

// Project Euler 42: count the "triangle words" in words.txt, i.e. words whose
// letter values (A = 1, B = 2, ...) add up to a triangle number.

// The first 100 triangle numbers (1 .. 5050), kept in a set of ints so the
// membership test is a direct lookup.
HashSet<int> tri_nums = new HashSet<int>();
for (int n = 1; n < 101; n++)
{
    tri_nums.Add(n * (n + 1) / 2);
}

// ReadAllText opens, reads and closes the file in one call, so the handle
// is released even when reading fails.
string fullBook = File.ReadAllText("C:\\Euler\\words.txt");

// The file is one line of comma-separated, double-quoted words.
string[] names = fullBook.Split(',');
int tri_count = 0;
foreach (string rawName in names)
{
    // Turn the surrounding quotes into spaces and trim them away.
    string name1 = rawName.Replace('"', ' ').Trim();

    // 'A' has character code 65, so subtracting 64 maps A..Z to 1..26.
    // NOTE(review): this assumes the words are upper-case ASCII - confirm against the file.
    int letter_sum = 0;
    foreach (char letter in name1)
    {
        letter_sum += letter - 64;
    }

    if (tri_nums.Contains(letter_sum))
    {
        Console.WriteLine("triangle Word : " + name1);
        tri_count++;
    }
}
Console.WriteLine();
Console.WriteLine("Triangle word count : " + tri_count);