Thursday, March 15, 2018

VIN anonymization

Data anonymization is a type of information sanitization whose intent is privacy protection. It is the process of either encrypting or removing personally identifiable information from data sets, so that the people whom the data describe remain anonymous.
This is a sample that anonymizes VINs, i.e. a system can send vehicle data, but it is difficult for anyone except the source system to trace that data back to the original VIN.

Old code (may be useful). When I did this POC, I tested it with 1 million VINs.
The SHA-256 hash function worked really well for this set. I never tested it with 20 million VINs, but based on N-grams this model should work.


import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.StreamTokenizer;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.util.*;

import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;


/// Goal: read millions of VINs and anonymize the last 7 digits.
/// This way a VIN and its data can be shared with anyone; however, the sourcing system needs to be able to trace back to the
// original VIN. Steps: use the Java crypto API hash function (as opposed to a simple Java function).
// So far there have been no collisions on one million VINs. In case of collisions,
/// the idea is to generate deterministic N-grams based on the hashed string.

/**
 * Proof-of-concept VIN anonymizer.
 *
 * <p>Reads one VIN per line from a text file, hashes each VIN with SHA-256 and
 * builds an anonymized VIN from the first {@value #PREFIX_LENGTH} characters of
 * the original followed by a {@value #NGRAM_LENGTH}-character upper-case hex
 * N-gram taken from the hash. The resulting {@code anonymized,original} pairs
 * are written to a CSV file so the source system can trace a VIN back.
 *
 * <p>If the anonymized VIN is already taken by a different VIN, the N-gram
 * window is shifted one character to the right and the lookup is retried, up to
 * {@value #MAX_ATTEMPTS} attempts in total.
 */
public class VINAnonymization {

    /** Input file used when no path is passed on the command line. */
    private static final String DEFAULT_INPUT = "D:\\demo\\lvins.txt";

    /** Output CSV used when no path is passed on the command line. */
    private static final String DEFAULT_OUTPUT = "D:\\demo\\out1.csv";

    /** Number of leading VIN characters copied unchanged into the anonymized VIN. */
    private static final int PREFIX_LENGTH = 9;

    /** Number of hash characters appended after the prefix. */
    private static final int NGRAM_LENGTH = 7;

    /** Number of N-gram offsets (0, 1, 2) tried before a VIN is given up on. */
    private static final int MAX_ATTEMPTS = 3;

    private static final char[] HEX_DIGITS = "0123456789abcdef".toCharArray();

    /**
     * Anonymizes every VIN in the input file and dumps the mapping as CSV.
     *
     * @param args optional: {@code args[0]} is the input file (one VIN per line),
     *             {@code args[1]} is the output CSV; missing values fall back to
     *             the original hard-coded paths
     * @throws IOException if the input cannot be read or the output cannot be written
     */
    public static void main(String[] args) throws IOException {
        String inputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT;
        String outputPath = (args != null && args.length > 1) ? args[1] : DEFAULT_OUTPUT;

        MessageDigest messageDigest;
        try {
            messageDigest = MessageDigest.getInstance("SHA-256");
        } catch (NoSuchAlgorithmException e) {
            throw new IllegalStateException("SHA-256 digest is not available", e);
        }

        // Insertion-ordered so the CSV rows follow the input order, which makes
        // two runs directly comparable.
        Map<String, String> anomVinMap = new LinkedHashMap<String, String>();

        try (BufferedReader br = new BufferedReader(new FileReader(inputPath))) {
            String rawLine;
            while ((rawLine = br.readLine()) != null) {
                String vin = rawLine.trim();
                if (vin.length() < PREFIX_LENGTH) {
                    // Blank or truncated lines cannot supply the prefix; skip them
                    // instead of aborting the whole run.
                    if (!vin.isEmpty()) {
                        System.err.println("Skipping malformed VIN-->" + vin);
                    }
                    continue;
                }

                // digest(byte[]) hashes the input and resets the digest for the next VIN.
                StringBuffer hexString =
                        toHex(messageDigest.digest(vin.getBytes(StandardCharsets.UTF_8)));
                String prefix = vin.substring(0, PREFIX_LENGTH);

                boolean mapped = false;
                for (int k = 0; k < MAX_ATTEMPTS && !mapped; k++) {
                    String nGramString = LRNGramString(hexString, NGRAM_LENGTH, k);
                    if (nGramString == null) {
                        break;
                    }
                    String anoVin = prefix.concat(nGramString);
                    String existing = anomVinMap.get(anoVin);
                    if (existing == null) {
                        anomVinMap.put(anoVin, vin);
                        mapped = true;
                    } else if (existing.equals(vin)) {
                        // Same VIN seen again: it already has its anonymized form.
                        mapped = true;
                    } else {
                        System.out.println("Collionson for a VIN-->" + anoVin + "K:" + k);
                    }
                }
                if (!mapped) {
                    System.err.println("No free anonymized VIN after " + MAX_ATTEMPTS
                            + " attempts;" + hexString);
                }
            }
        }

        // Dump the map to a CSV file for comparisons.
        System.out.println("Anonymization VIN map count" + anomVinMap.size());
        String eol = System.getProperty("line.separator");
        try (Writer writer = new FileWriter(outputPath)) {
            for (Map.Entry<String, String> entry : anomVinMap.entrySet()) {
                writer.append(entry.getKey())
                      .append(',')
                      .append(entry.getValue())
                      .append(eol);
            }
        }
    }

    /**
     * Returns the upper-cased window of {@code legth} characters that starts at
     * {@code position} in {@code orgString}.
     *
     * @param orgString source text (here: the hex-encoded hash); may be {@code null}
     * @param legth     number of characters in the window
     * @param position  zero-based start offset of the window
     * @return the upper-cased window, or {@code null} if {@code orgString} is
     *         {@code null}, an argument is negative, or the window does not fit
     */
    static String LRNGramString(StringBuffer orgString, int legth, int position) {
        if (orgString == null || legth < 0 || position < 0
                || position > orgString.length() - legth) {
            return null;
        }
        // End index is position + legth so every offset yields a full-length window.
        return orgString.substring(position, position + legth).toUpperCase(Locale.ROOT);
    }

    /** Hex-encodes {@code bytes} with two lower-case digits per byte (leading zeros kept). */
    private static StringBuffer toHex(byte[] bytes) {
        StringBuffer hex = new StringBuffer(bytes.length * 2);
        for (byte b : bytes) {
            hex.append(HEX_DIGITS[(b >> 4) & 0x0F]).append(HEX_DIGITS[b & 0x0F]);
        }
        return hex;
    }

}