The scenario: a daily mainframe job generates a delta of product data as a spreadsheet extract of up to one million rows, and search needs to happen across 30+ product fields. My initial thought was to use XSLT, but after seeing that some columns needed computation, I ended up writing some Java code instead. The code generates one Solr document XML file for each row, and we then indexed the whole batch in one go using post.jar. (Back in the Solr 1.4 days, JSON was not that popular, so I used XML. I have an XSLT version too and will post that code soon.)
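For reference, each generated file follows Solr's XML update format: one <doc> wrapped in an <add>, with one <field> element per CSV column. The field names below are just illustrative; the real ones come from the CSV header row:

<add>
  <doc>
    <field name="id">12345</field>
    <field name="productName">Sample product</field>
    <field name="category">Widgets</field>
    <!-- ...one field per column, 30+ in total -->
  </doc>
</add>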
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.w3c.dom.Attr;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
public class CsvToSolrXmlDoc
{
public String[] columnNames = null;
public String getColumnName(int i)
{
if (i > columnNames.length - 1) return "checkInputData"; // sentinel field name flags rows with more values than headers
return columnNames[i];
}
public void initHeaders(String[] cols)
{
columnNames = cols;
}
public void writeProdInfo(String[] cols) throws Exception {
if (cols.length == 0) return;
String prodId = cols[0];
try {
Integer.parseInt(prodId); // validate that the id is numeric (specific to my use case; you can skip this check)
} catch (NumberFormatException e) {
System.out.println("\n Skipping id " + prodId);
return;
}
String outputFile = "c:\\a_edocs" + "\\" + prodId + ".xml"; // one XML file per product row
try {
DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
Document doc = builder.newDocument();
Element addEle = doc.createElement("add"); // Solr update format: <add><doc><field .../></doc></add>
doc.appendChild(addEle);
Element root = doc.createElement("doc");
addEle.appendChild(root);
String fname = "field";
for (int i = 0; i < cols.length; i++) {
String cName = getColumnName(i);
//// Do all column-based post-processing here
String value = (cols[i] == null) ? "" : cols[i].trim();
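// Hypothetical example of per-column computation (the kind of processing
// that pushed me from XSLT to Java), e.g. stripping a currency symbol
// from a price column:
// if ("price".equals(cName)) value = value.replace("$", "");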
Element node = doc.createElement(fname);
Attr attr = doc.createAttribute("name");
attr.setValue(cName);
node.setAttributeNode(attr);
node.appendChild(doc.createTextNode(value));
root.appendChild(node);
}
DOMSource domSource = new DOMSource(doc);
Transformer transformer = TransformerFactory.newInstance().newTransformer();
transformer.setOutputProperty(OutputKeys.INDENT, "yes");
transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
StreamResult result = new StreamResult(new File(outputFile));
transformer.transform(domSource, result);
} catch (Exception e) {
e.printStackTrace();
}
}
public void csvToXML(String inputFile) throws java.io.FileNotFoundException, java.io.IOException
{
BufferedReader br = new BufferedReader(new FileReader(inputFile));
String line = null;
int i=0;
while ((line = br.readLine()) != null) {
String[] values = line.split(","); // naive split; assumes values contain no embedded commas or quotes
if (i == 0) {
// first row is the header row: remember the column names
for (int j = 0; j < values.length; j++) {
System.out.println("\n col index,name:" + j + ":" + values[j]);
}
initHeaders(values);
i++;
}else{
try {
writeProdInfo(values);
} catch (Exception e) {
// a bad row should not stop the batch; log it and continue
e.printStackTrace();
}
}
}
br.close();
}
public static void main(String argv[]) throws java.io.IOException
{
CsvToSolrXmlDoc cp = new CsvToSolrXmlDoc();
cp.csvToXML("C:\\a_work\\three.csv");
}
}
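To index the generated files in one go, run Solr's post.jar (it ships in example/exampledocs of the 1.4 distribution) from the output directory. Something like the following, assuming Solr is listening at the default http://localhost:8983/solr/update:

java -jar post.jar *.xml

On Windows you may need to list the file names explicitly, since cmd.exe does not expand the * wildcard for you.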