Tuesday, February 25, 2014

Very Old: Java code convert Excel (CSV) file to generate SOLR documents

Scenario is :
   Daily mainframe system generates delta of product data (excel contains rows up to one million rows. Search needs to happen across 30+ product fields.) My initial thoughts are use XSLT however after seeing some computation on some columns, I end up writing some Java code. This code generate one SOLR document xml for each Excel row & with using post.jar file, we used to index the data in one go. (Back in SOLR 1.4 days, JSON format was not that popular & so I used xml. I have XSLT version too. Will post that code very soon.)

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.w3c.dom.Attr;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
public class CsvToSolrXmlDoc
{
public String[] columnNames = null;
public  String getColumnName(int i)
{
if ( i>columnNames.length -1 ) return "checkInputData";
        return columnNames[i];
}

public  void initHeaders(String[] cols)
{
columnNames = cols;
}


public  void writeProdInfo(String[] cols) throws Exception {
 if ( cols.length == 0)  return;
 String prodId= cols[0];
 try {
     int idVal = Integer.parseInt(prodId); /// this is my use case. U can ignore this validation
} catch (NumberFormatException e) {
System.out.println("\n Skiping id "+prodId);
return;
}
String outputFile  = "c:\\a_edocs" + "\\"+prodId+".xml"
  try {

DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
Document doc = builder.newDocument();
Element addEle = doc.createElement("add");
doc.appendChild(addEle);
Element root = doc.createElement("doc");
addEle.appendChild(root);
String fname = "field";
for ( int i=0; i
  String cName = getColumnName(i);
                           ////DO ALL column based post processing 
  String value = cols[i].trim();
  if ( value == null) value = "";
  Element node  = doc.createElement(fname);
  Attr attr = doc.createAttribute("name");
  attr.setValue(cName);
  node.setAttributeNode(attr);
  node.appendChild(doc.createTextNode(value));
  root.appendChild(node);
}

DOMSource domSource = new DOMSource(doc);
Transformer transformer = TransformerFactory.newInstance().newTransformer();
transformer.setOutputProperty(OutputKeys.INDENT, "yes");
transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
StreamResult result = new StreamResult(new File(outputFile));
transformer.transform(domSource, result);

} catch (Exception e) {
e.printStackTrace();
}

}

public void csvToXML(String inputFile) throws java.io.FileNotFoundException, java.io.IOException
{
BufferedReader br = new BufferedReader(new FileReader(inputFile));
StreamTokenizer st = new StreamTokenizer(br);
String line = null;
int i=0;
while ((line = br.readLine()) != null){
String[] values = line.split(",");
if ( i==0){
for ( int j=0; j
System.out.println("\n col index,name:"+j+":"+values[j]);
}
initHeaders(values);i++;
}else{
try {
writeProdInfo(values);
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}
        br.close();
}

public static void main(String argv[]) throws java.io.IOException
{
CsvToSolrXmlDoc cp = new CsvToSolrXmlDoc();
cp.csvToXML("C:\\a_work\\three.csv");
}

}

No comments: