C#: How to parse every word in a book and output the frequency of repeating words: The Holy Bible

The book being parsed in this example is the King James Holy Bible (bible.txt).

Column A is the word, Column B is the frequency

Download the full output here:

Microsoft Excel format: bibFreqxls.xls

[csharp]
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
using System.Text.RegularExpressions;
using System.Collections;

namespace BibleWordFreq
{
    /// <summary>
    /// Console program that counts how often each word occurs in a text file
    /// and writes the result as "word: count" lines to an output file.
    /// </summary>
    class Program
    {
        // Location of the book to parse and of the generated frequency report.
        private const string BookPath = "C:\\Book\\bible.txt";
        private const string OutputPath = "C:\\Book\\OUTPUT.txt";

        /// <summary>
        /// Reads the whole book, tallies the word frequencies, prints the number
        /// of unique words and the total number of words to the console, and
        /// writes one "word: count" line per unique word to the output file.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            // Read the entire book into memory; the using block closes the
            // stream even if reading throws.
            string fullBook;
            using (StreamReader bookStream = File.OpenText(BookPath))
            {
                fullBook = bookStream.ReadToEnd();
            }

            int totalWords;
            Dictionary<string, int> frequencies = CountWords(fullBook, out totalWords);

            Console.WriteLine("unique words : " + frequencies.Count);
            Console.WriteLine("total words : " + totalWords);

            // CreateText overwrites any previous report. Appending would add a
            // second copy of the whole table every time the program is re-run.
            using (StreamWriter streamWrite = File.CreateText(OutputPath))
            {
                foreach (KeyValuePair<string, int> entry in frequencies)
                {
                    streamWrite.WriteLine(string.Format("{0}: {1}", entry.Key, entry.Value));
                }
            }
        }

        /// <summary>
        /// Counts the occurrences of every word in <paramref name="text"/>,
        /// ignoring case.
        /// </summary>
        /// <param name="text">The text to analyse.</param>
        /// <param name="totalWords">Receives the total number of words found.</param>
        /// <returns>A dictionary mapping each lower-cased word to its count.</returns>
        private static Dictionary<string, int> CountWords(string text, out int totalWords)
        {
            // Remove numbers and punctuation. Characters are deleted rather than
            // replaced by a space, so e.g. "don't" is counted as "dont".
            string cleaned = Regex.Replace(text, "\\.|;|:|,|[0-9]|'", "");

            // Every run of word characters is one word.
            MatchCollection wordCollection = Regex.Matches(cleaned, @"[\w]+");

            // A single dictionary replaces the linked list + hashtable pair:
            // lookups are O(1) instead of a linear scan per word, and the
            // counts are stored as ints instead of boxed objects.
            Dictionary<string, int> frequencies = new Dictionary<string, int>();
            foreach (Match match in wordCollection)
            {
                // Invariant lower-casing keeps the keys independent of the
                // machine's current culture.
                string word = match.Value.ToLowerInvariant();

                // TryGetValue leaves count at 0 for a word not seen before.
                int count;
                frequencies.TryGetValue(word, out count);
                frequencies[word] = count + 1;
            }

            totalWords = wordCollection.Count;
            return frequencies;
        }
    }
}

[/csharp]

6 thoughts on “C#: How to parse every word in a book and output the frequency of repeating words: The Holy Bible”

Fayrouz Noor 05/18/2012 at 5:43 pm

Thanks very much, this code helped me a lot in my GP.

Reply ↓

Venkat B 05/26/2012 at 1:37 am

This is very helpful code. Can we optimize it by using a dictionary instead of a linked list? Also, I need every non-alphanumeric character to be treated as a separator. Can you suggest how?

Reply ↓

Farhad 11/08/2013 at 2:22 pm

The following dictionary implementation is about 10 times faster:

using System; using System.IO; using System.Collections.Generic; using System.Text.RegularExpressions;
namespace BibleWordFreq
{
    /// <summary>
    /// Dictionary-based variant of the word-frequency program.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Reads the book, counts how often each lower-cased word occurs, prints
        /// the unique and total word counts, and writes "word: count" lines to
        /// the output file.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            string fullBook = File.ReadAllText("C:\\Book\\bible.txt").ToLower();

            //remove numbers and punctuation
            fullBook = Regex.Replace(fullBook, "\\.|;|:|,|[0-9]|'", "");

            //create collection of words
            var wordCollection = Regex.Matches(fullBook, @"[\w]+");

            //calculate word frequencies (key: word, value: number of occurrences)
            var dict = new Dictionary<string, int>();
            for (int i = 0; i < wordCollection.Count; i++)
            {
                string word = wordCollection[i].Value;
                if (!dict.ContainsKey(word))
                {
                    dict[word] = 1;
                }
                else
                {
                    ++dict[word];
                }
            }

            Console.WriteLine("unique words : " + dict.Count);
            Console.WriteLine("total words : " + wordCollection.Count);

            // The using block closes the writer even if writing throws.
            using (StreamWriter streamWrite = new StreamWriter("C:\\Book\\OUTPUT.txt"))
            {
                foreach (KeyValuePair<string, int> kv in dict)
                {
                    streamWrite.WriteLine("{0}: {1}", kv.Key, kv.Value);
                }
            }
        }
    }
}

Reply ↓
1. Farhad 11/08/2013 at 2:28 pm
  
  Notice: The editor eats up Generic parameters (anything enclosed in angle brackets) even if surrounded by “code” tags. The dictionary should have string as key and int as value.
  
  Reply ↓

Jessica 04/02/2014 at 12:26 pm

Hey how did you get the output in excel file, I did not find any code for that. Please help me with it.

Thanks

Reply ↓

Joseph 01/19/2018 at 10:32 am

How many times are the words “present truth” found in the New King James Bible?

Reply ↓

MantasCode

Parsing different data.

C#: How to parse every word in a book and output the frequency of repeating words: The Holy Bible

6 thoughts on “C#: How to parse every word in a book and output the frequency of repeating words: The Holy Bible”

Leave a Reply Cancel reply