The data comes from parsing 380,000 Tumblr usernames. Link Here
A C# program was written to:
- parse an English dictionary for words. Link here
- parse all Tumblr usernames
- search for each English word with a length of 5 or greater within each username
- store each English word found, and count its frequency throughout all names
- Wordle was used to generate a word cloud of the most frequent words.
Most Frequent English Words Found in TUMBLR’s Usernames
C#
// Tallies how many usernames contain each dictionary word of five or more
// characters, prints the tallies (most frequent first) to the console, and
// appends them to output.txt as "data.setValue(row, col, value);" lines.
//
// NOTE(review): `dictionary` is declared outside this snippet; the AddLast call
// suggests a LinkedList<string> - confirm against its declaration.

// Stopwatch is monotonic, so the progress timer is not skewed by clock changes.
System.Diagnostics.Stopwatch elapsed = System.Diagnostics.Stopwatch.StartNew();
Console.BufferHeight = 5000;
Console.WriteLine("indeed.");

// Built once up front instead of once per headword.
Regex headwordPattern = new Regex(@"<hw>(.*?)</hw>", RegexOptions.Multiline);
Regex disallowedChars = new Regex("[^a-zA-Z0-9 -]");

// The same headword can be listed more than once across the dictionary files;
// without this filter each repeat would be counted again for every username.
HashSet<string> seenWords = new HashSet<string>();

// ---- 1. Load headwords from the per-letter files gcide_a.xml .. gcide_z.xml ----
for (char letter = 'a'; letter <= 'z'; letter++)
{
    // ReadAllText closes the file even when reading throws.
    string fullXMLList = File.ReadAllText("C:\\dictionary\\gcide_" + letter + ".xml");
    Console.WriteLine("loaded " + letter);
    Console.Beep();

    foreach (Match headword in headwordPattern.Matches(fullXMLList))
    {
        // Group 1 is the text between the <hw> tags. Invariant lowercasing keeps
        // ASCII letters stable regardless of the machine's culture settings.
        string word = headword.Groups[1].Value.ToLowerInvariant();
        word = disallowedChars.Replace(word, "");

        if (word.Length > 4 && seenWords.Add(word))
        {
            dictionary.AddLast(word);
        }
    }
}

for (int h = 0; h < 5; h++)
{
    Console.Beep();
    Console.WriteLine("Parsing Tumblr User Names.");
}

// ---- 2. Count, for every dictionary word, the usernames that contain it ----
string[] parts = File.ReadAllText("C:\\____TUBLRUSR\\l4_blogs.txt").Split(',');
Dictionary<string, int> freqCount = new Dictionary<string, int>();
int c = 0;

foreach (string item in parts)
{
    // Entries are comma-separated and may be wrapped in single quotes.
    string part = item.Replace("'", "").Trim().ToLowerInvariant();
    c += 1;

    // Progress line every 100 usernames: remaining count, elapsed seconds, current name.
    if (c % 100 == 0)
    {
        Console.WriteLine((parts.Length - c) + " Time [" + elapsed.Elapsed.TotalSeconds + "] " + part);
    }

    foreach (string d_word in dictionary)
    {
        if (!part.Contains(d_word))
        {
            continue;
        }

        // TryGetValue leaves `previous` at 0 for a word not seen before.
        int previous;
        freqCount.TryGetValue(d_word, out previous);
        freqCount[d_word] = previous + 1;
    }
}

// ---- 3. Report, most frequent first ----
Console.WriteLine();

// `using` guarantees the output file is flushed and closed even on failure.
using (StreamWriter streamWrite = File.AppendText("C:\\____TUBLRUSR\\output.txt"))
{
    int q = 0;
    foreach (KeyValuePair<string, int> item in freqCount.OrderByDescending(entry => entry.Value))
    {
        Console.WriteLine(item.Key + " " + item.Value);
        q += 1;

        // Written straight to the file rather than accumulated in one large
        // string, which avoids quadratic string concatenation.
        streamWrite.Write("data.setValue(" + q + ", 0, '" + item.Key + "');\r\n");
        streamWrite.Write("data.setValue(" + q + ", 1, " + item.Value + ");\r\n");
    }

    // Trailing newline after the last entry.
    streamWrite.WriteLine();
}
Where were you able to find the data for all user names?