Pages

Tuesday, 10 June 2014

Test Whether a String/Piece of Text is English (Java)

Testing Whether a String/Piece of Text is English

Being able to tell whether a piece of text looks like valid English has many applications, especially in cryptography when performing a ciphertext-only attack. This form of brute-force attack tries every possible key and decides which key is correct by checking the output and selecting the key that produced the plaintext most like the English language.
The extract below was designed for that purpose.  Use the testEnglish() method to calculate a score for the plaintext that each key produces.  Store each of these scores in some kind of structure (e.g. an ArrayList) and at the end select the key with the highest score.
This was written in Java. The letter-frequency, bigram and trigram scores are percentages based on how often each one appears in English text.  Remember to edit the alpha, beta and gamma values to your own liking.
//List of letter pairs and percentage score of how common they are in english language
static String[] letterFreq = {" ", "e", "t", "a", "o", "i", "n", "s", "h", "r", "d", "l", "u", "c", "m", "f", "w", "g", "y", "p", "b", "v", "k"};
static Double[] letterFreqScoring = {13.0, 12.6, 9.1, 8.0, 7.6, 7.0, 7.0, 6.3, 6.2, 6.0, 4.3, 4.1, 2.8, 2.6, 2.6, 2.3, 2.2, 2.0, 2.0, 1.8, 1.5, 1.0, 0.7};
 
static String[] bi = {"th", "he", "in", "er", "an", "re", "nd", "on", "en", "at", "ou", "ed", "ha", "to", "or", "it", "is", "hi", "es", "ng"};
static Double[] bigramScoring = {3.8, 3.6, 2.2, 2.2, 2.1, 1.8, 1.6, 1.4, 1.4, 1.3, 1.3, 1.3, 1.3, 1.2, 1.2, 1.1, 1.1, 1.1, 1.1, 1.1};
 
static String[] tri = {"the", "and", "ing", "her", "hat", "his", "tha", "ere", "for", "ent", "ion", "ter", "was", "you", "ith", "ver", "all", "wit", "thi", "tio"};
static Double[] trigramScoring = {8.0, 3.5, 1.6, 1.1, 0.8, 0.7, 0.6, 0.6, 0.6, 0.6, 0.5, 0.5, 0.5, 0.5, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4};
 
static ArrayList<String> bigram = new ArrayList<String>();
static ArrayList<String> trigram = new ArrayList<String>();
static ArrayList<String> letterFrequency = new ArrayList<String>();
 
bigram = new ArrayList<String>(Arrays.asList(bi));
trigram = new ArrayList<String>(Arrays.asList(tri));
letterFrequency = new ArrayList<String>(Arrays.asList(letterFreq));
 
private static double testEnglish(String text){
 
  //Variants to apply a higher score to bigram and trigram and slightly reduce single letter frequency
  int alpha = 0.5;
  int beta = 3;
  int gamma = 7;
 
  int score = 0;
 int i;
 text = text.toLowerCase();
 
 for (i = 0; i < text.length() - 1; i++){   
  if (letterFrequency.contains(text.substring(i, i+1)))
   score += alpha * letterFreqScoring[letterFrequency.indexOf(text.substring(i, i+1))];
 }
 
 for (i = 0; i < text.length() - 2; i++){
  if (bigram.contains(text.substring(i, i+2)))
   score += beta * bigramScoring[bigram.indexOf(text.substring(i, i+2))];
 }
 
 for (i = 0; i < text.length() - 3; i++){
  if (trigram.contains(text.substring(i, i+3)))
   score += gamma * trigramScoring[trigram.indexOf(text.substring(i, i+3))];
 }
 
 return score;
}

1 comment:

  1. Java SE & Java EE article is practical oriented and real time examples. How Java EE address the enterprise development is very important. for that you need a practical orieneted Java Training Courses you need.

    Great Article

    Online Java Training
    Online Java Training
    Java Training Institutes in Chennai
    J2EE training
    Java Training in Chennai
    Java Interview Questions
    Best Recommended books for Spring framework

    ReplyDelete