// This is a sample that anonymizes VINs, i.e. a system sends vehicle data, but it is difficult for anyone except the source system to trace the data back to the original VIN.
// Old code (may be useful). When I did this POC, I tested it with 1 million VINs.
// The SHA-256 hash function worked really well for this set. It was never tested with 20 million VINs, but based on N-grams this model should work.
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.StreamTokenizer;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.*;
// Goal: read millions of VINs and anonymize the last 7 digits.
// This way a VIN and its data can be shared with anyone; however, the sourcing system needs to be able to trace back to
// the original VIN. Steps: use the Java crypto API hash function (compared with a simple Java function).
// So far there are no collisions on one million VINs. In case of collisions,
// the idea is to generate deterministic N-grams based on the hashed string.
/**
 * Proof of concept that pseudonymizes vehicle identification numbers (VINs).
 *
 * <p>Each input line is hashed with SHA-256. The first {@value #PREFIX_LENGTH}
 * characters of the VIN are kept and the rest is replaced by a
 * {@value #NGRAM_LENGTH}-character window (n-gram) taken from the upper-cased
 * hex digest. If the resulting value is already taken by a different VIN, the
 * window is shifted by one character and tried again, up to
 * {@value #MAX_ATTEMPTS} attempts. The mapping
 * {@code anonymized VIN -> original VIN} is written to a CSV file so the
 * source system can trace values back.
 *
 * <p>NOTE(review): prefix (9) + n-gram (7) yields 16 characters, while the
 * header comment talks about replacing "the last 7 digits" of a VIN; confirm
 * the intended prefix length against the expected VIN format.
 */
public class VINAnonymization {

    /** Input file, one VIN per line. */
    private static final String INPUT_FILE = "D:\\demo\\lvins.txt";

    /** Output CSV file: {@code anonymizedVin,originalVin} per line. */
    private static final String OUTPUT_FILE = "D:\\demo\\out1.csv";

    /** Number of leading VIN characters copied unchanged into the result. */
    private static final int PREFIX_LENGTH = 9;

    /** Number of digest characters appended to the kept prefix. */
    private static final int NGRAM_LENGTH = 7;

    /** How many n-gram offsets (0, 1, 2, ...) are tried before giving up on a VIN. */
    private static final int MAX_ATTEMPTS = 3;

    private static final char[] HEX_DIGITS = "0123456789abcdef".toCharArray();

    /**
     * Reads the VIN list, builds the anonymized mapping, prints its size and
     * dumps it to the output CSV file.
     *
     * @param args unused
     * @throws IOException if the input cannot be read or the output cannot be written
     */
    public static void main(String[] args) throws IOException {
        Map<String, String> anomVinMap;
        try (BufferedReader br = new BufferedReader(new FileReader(INPUT_FILE))) {
            anomVinMap = anonymize(br);
        }

        // Dump the map to a CSV file for later comparison / trace-back.
        System.out.println("Anonymization VIN map count" + anomVinMap.size());
        writeCsv(anomVinMap, OUTPUT_FILE);
    }

    /**
     * Builds the {@code anonymized VIN -> original VIN} map for every line of the reader.
     *
     * <p>Lines are trimmed; lines shorter than the kept prefix are reported on
     * {@code System.err} and skipped. A VIN that occurs more than once keeps its
     * first mapping. A VIN whose every attempt collides with another VIN is
     * left out of the map (each collision is reported on {@code System.out}).
     */
    private static Map<String, String> anonymize(BufferedReader reader) throws IOException {
        final MessageDigest messageDigest;
        try {
            messageDigest = MessageDigest.getInstance("SHA-256");
        } catch (NoSuchAlgorithmException e) {
            // SHA-256 is mandatory on every Java platform, so this is not recoverable.
            throw new IllegalStateException("SHA-256 is not available", e);
        }

        Map<String, String> anomVinMap = new HashMap<>();
        int unresolved = 0;
        String line;
        while ((line = reader.readLine()) != null) {
            String vin = line.trim();
            if (vin.length() < PREFIX_LENGTH) {
                if (!vin.isEmpty()) {
                    System.err.println("Skipping line shorter than " + PREFIX_LENGTH + " characters: " + vin);
                }
                continue;
            }

            // digest() also resets the MessageDigest, so it is ready for the next VIN.
            byte[] mdbytes = messageDigest.digest(vin.getBytes(StandardCharsets.UTF_8));
            StringBuffer hexString = toHex(mdbytes);
            String prefix = vin.substring(0, PREFIX_LENGTH);

            for (int attempt = 0; attempt < MAX_ATTEMPTS; attempt++) {
                String nGramString = LRNGramString(hexString, NGRAM_LENGTH, attempt);
                if (nGramString == null) {
                    System.out.println(unresolved++ + ";" + hexString.toString());
                    break;
                }
                String anoVin = prefix.concat(nGramString);
                String owner = anomVinMap.get(anoVin);
                if (owner == null) {
                    anomVinMap.put(anoVin, vin);
                    break; // stop at the first free slot; one VIN gets exactly one pseudonym
                }
                if (owner.equals(vin)) {
                    break; // same VIN seen again: keep the mapping it already has
                }
                System.out.println("Collionson for a VIN-->" + anoVin + "K:" + attempt);
            }
        }
        return anomVinMap;
    }

    /** Writes one {@code key,value} line per map entry to the given file. */
    private static void writeCsv(Map<String, String> anomVinMap, String path) throws IOException {
        String eol = System.getProperty("line.separator");
        try (Writer writer = new FileWriter(path)) {
            for (Map.Entry<String, String> entry : anomVinMap.entrySet()) {
                writer.append(entry.getKey())
                      .append(',')
                      .append(entry.getValue())
                      .append(eol);
            }
        }
    }

    /**
     * Encodes bytes as lower-case hex, always two characters per byte, so the
     * result has a fixed length and distinct byte arrays never share an encoding.
     */
    private static StringBuffer toHex(byte[] bytes) {
        StringBuffer hex = new StringBuffer(bytes.length * 2);
        for (byte b : bytes) {
            hex.append(HEX_DIGITS[(b >> 4) & 0x0F]).append(HEX_DIGITS[b & 0x0F]);
        }
        return hex;
    }

    /**
     * Returns the upper-cased n-gram of {@code legth} characters that starts at
     * index {@code position} of {@code orgString}.
     *
     * @param orgString source text (the hex digest); may be {@code null}
     * @param legth     number of characters in the n-gram
     * @param position  zero-based start index of the n-gram
     * @return the upper-cased n-gram, or {@code null} if {@code orgString} is
     *         {@code null} or the requested window does not fit inside it
     */
    static String LRNGramString(StringBuffer orgString, int legth, int position) {
        if (orgString == null) {
            return null;
        }
        if (legth < 0 || position < 0 || position > orgString.length() - legth) {
            return null;
        }
        // The end index is position + legth so every offset yields a full-length n-gram.
        return orgString.substring(position, position + legth).toUpperCase(Locale.ROOT);
    }
}