Words used in Tumblr’s usernames (380,000 users)

The data is a parse of 380,000 usernames. Link Here

C# was written to:

  • parse an English dictionary for words. Link here
  • parse all Tumblr usernames
  • search for each English word with a length of 5 or greater within each username
  • store each English word found, and count its frequency throughout all names
  • Wordle was used to generate a word cloud of the most frequent words.

Most Frequent English Words Found in Tumblr’s Usernames
TUMBLR_USERNAME_ENGLISHWORDS

C#

// Wall-clock start time; the username pass later in the script reports
// elapsed seconds relative to this value.
DateTime start = DateTime.Now;
Console.BufferHeight = 5000;
Console.WriteLine("indeed.");

// Built once instead of once per headword. Removes every character that is
// not an ASCII letter, digit, space or hyphen.
Regex nonWordChars = new Regex("[^a-zA-Z0-9 -]");

// Words already appended during this pass. The same headword can occur more
// than once in the XML; appending it again would make the later per-username
// scan count it several times and inflate its frequency.
HashSet<string> seenWords = new HashSet<string>();

// One dictionary file per letter: gcide_a.xml ... gcide_z.xml.
// Iterating the lowercase chars directly avoids a culture-sensitive ToLower()
// on the file-name letter (e.g. 'I' lowercases to a dotless i under tr-TR).
for (char letterChar = 'a'; letterChar <= 'z'; letterChar++)
{
    string letter = letterChar.ToString();

    string fullXMLList;
    // using guarantees the file handle is released even if the read throws.
    using (StreamReader wordStream = File.OpenText("C:\\dictionary\\gcide_" + letter + ".xml"))
    {
        fullXMLList = wordStream.ReadToEnd();
    }

    Console.WriteLine("loaded " + letter);
    Console.Beep();

    // Every <hw>...</hw> element on a single line is treated as one headword.
    MatchCollection wordCollection = Regex.Matches(fullXMLList, @"<hw>(.*?)</hw>", RegexOptions.Multiline);
    foreach (Match headword in wordCollection)
    {
        string word = headword.Value.ToLowerInvariant();
        // Plain string replacement: the tags are literals, no regex needed.
        word = word.Replace("<hw>", "").Replace("</hw>", "");
        word = nonWordChars.Replace(word, "");

        // Only words of 5+ characters are kept, and each distinct word once.
        // NOTE(review): `dictionary` is declared outside this snippet; it is
        // assumed to be empty before this loop runs.
        if (word.Length > 4 && seenWords.Add(word))
        {
            dictionary.AddLast(word);
        }
    }
}
// Announce the start of the username pass: five beeps, each with a message line.
for (int h = 0; h < 5; h++)
{
    Console.Beep();
    Console.WriteLine("Parsing Tumblr User Names.");
}

// Read the whole username list; using guarantees the handle is released even
// if the read throws.
string myString;
using (System.IO.StreamReader myFile = new System.IO.StreamReader("C:\\____TUBLRUSR\\l4_blogs.txt"))
{
    myString = myFile.ReadToEnd();
}

// The file is treated as one comma-separated list of (optionally single-quoted) names.
string[] parts = myString.Split(',');

// Maps each dictionary word to the number of usernames that contain it.
Dictionary<string, int> freqCount = new Dictionary<string, int>();

// Number of usernames processed so far.
int c = 0;
foreach (string item in parts)
{
    // Normalise: drop single quotes, trim whitespace, lowercase independently
    // of the machine's culture so matching against the dictionary is stable.
    string part = item.Replace("'", "").Trim().ToLowerInvariant();
    c += 1;

    // Progress line every 100 names: names remaining, elapsed seconds, current name.
    // The remaining count is derived from the actual input size rather than a
    // hard-coded total, so it stays correct for any input file.
    if (c % 100 == 0)
    {
        Console.WriteLine((parts.Length - c) + "   Time  [" + (DateTime.Now - start).TotalSeconds + "]    " + part);
    }

    foreach (string d_word in dictionary)
    {
        if (part.Contains(d_word))
        {
            // TryGetValue leaves the count at 0 for a word not seen before,
            // so a single lookup + store covers both the new and existing case.
            int occurrences;
            freqCount.TryGetValue(d_word, out occurrences);
            freqCount[d_word] = occurrences + 1;
        }
    }
}
 
Console.WriteLine();

// Accumulates the generated output lines. A StringBuilder avoids re-copying
// the whole text on every append, which string += does for each found word.
System.Text.StringBuilder master_String = new System.Text.StringBuilder();

// 1-based row number of the current word in the generated output.
int q = 0;

// Most frequent words first.
foreach (KeyValuePair<string, int> item in freqCount.OrderByDescending(key => key.Value))
{
    Console.WriteLine(item.Key + "   " + item.Value);
    q += 1;
    // Two lines per word: column 0 holds the word, column 1 its count.
    // NOTE(review): the data.setValue(row, col, value) shape looks like input
    // for a JavaScript charting data table - confirm against the consumer.
    master_String.Append("data.setValue(").Append(q).Append(", 0, '").Append(item.Key).Append("');\r\n");
    master_String.Append("data.setValue(").Append(q).Append(", 1, ").Append(item.Value).Append(");\r\n");
}

// Append (not overwrite) to the output file; using guarantees the writer is
// flushed and closed even if the write throws.
using (StreamWriter streamWrite = File.AppendText("C:\\____TUBLRUSR\\output.txt"))
{
    streamWrite.WriteLine(master_String.ToString());
}

One thought on “Words used in Tumblr’s usernames (380,000 users)”

Leave a Reply

Your email address will not be published. Required fields are marked *