Joe, do you have access to a profiler like VisualVM? It could be that the regexp is not scaling - I have seen this in my SQL importer project. Just a thought; it would be great if you could measure where the slowdown occurs.
/peter Sent from my phone. On Aug 16, 2011 11:09 PM, "Jose Vinicius Pimenta Coletto" < [email protected]> wrote: > Sorry, the source code follows: > > public class InitialDBCreator { > private static final SimpleDateFormat DATE_PARSER = new > SimpleDateFormat("dd/MM/yyyy"); > private static final SimpleDateFormat DATE_FORMATTER = new > SimpleDateFormat("yyyyMMdd"); > > private static final int GRP_DEST_DOC = 1; > private static final int GRP_DEST_NAME = 2; > private static final int GRP_SRC_DOC = 3; > private static final int GRP_SRC_NAME = 5; > private static final int GRP_QUAL = 6; > private static final int GRP_ENTRY_DATE = 7; > private static final int GRP_PART_INT = 8; > private static final int GRP_PART_DEC = 9; > private static final Pattern PTRN_LINE = > Pattern.compile("(\\d{11,14})\\t([^\\t]+)\\t(\\d{11,14})\\t"+ > > "([^\\t]+)\\t([^\\t]+)\\t([^\\t]+)\\t(\\d{2}/\\d{2}/"+ > > "\\d{4})\\t(\\d{1,3}),(\\d{2})%\\t(\\d{2}/\\d{2}/\\d{4})"); > > private final BatchInserter inserter; > private final GraphDatabaseService dbService; > private final BatchInserterIndexProvider indexProvider; > private final BatchInserterIndex index; > > public InitialDBCreator(final String storeDir, final Map<String, String> > config, final String indexName) { > System.out.println("Iniciando inserter..."); > inserter = new BatchInserterImpl(storeDir, config); > dbService = inserter.getGraphDbService(); > System.out.println("Iniciando indexProvider..."); > indexProvider = new LuceneBatchInserterIndexProvider(inserter); > System.out.println("Iniciando index..."); > index = indexProvider.nodeIndex(indexName, MapUtil.stringMap("type", > "exact")); > System.out.println("DB iniciado!"); > Runtime.getRuntime().addShutdownHook( > new Thread() { > @Override > public void run() { > indexProvider.shutdown(); > inserter.shutdown(); > } > }); > } > > public void shutdown() { > index.flush(); > indexProvider.shutdown(); > inserter.shutdown(); > } > > private File prepareNodesFile(final File 
initialFile) { > File nodesFile = null; > int count; > int countErr; > > try { > System.out.println("Extracting nodes..."); > File tmpFile = File.createTempFile("qsa-tempnodes", ".txt"); > BufferedWriter writer = new BufferedWriter(new FileWriter(tmpFile)); > InputStream in = FUtils.getInputStream(initialFile); > BufferedReader reader = new BufferedReader(new InputStreamReader(in)); > String line = null; > count = 0; > countErr = 0; > while ((line = reader.readLine()) != null) { > Matcher matcher = PTRN_LINE.matcher(line); > if (matcher.matches()) { > String docOne = matcher.group(GRP_SRC_DOC); > String nameOne = matcher.group(GRP_SRC_NAME); > if (!docOne.equals("") && !nameOne.equals("")) { > writer.write(docOne+"|"+nameOne+"\n"); > } > > String docTwo = matcher.group(GRP_DEST_DOC); > String nameTwo = matcher.group(GRP_DEST_NAME); > if (!docTwo.equals("") && !nameTwo.equals("")) { > writer.write(docTwo+"|"+nameTwo+"\n"); > } > count++; > } else { > System.err.println("ERRO: the line '"+line+"' doesn't match the pattern."); > System.err.println("---"); > countErr++; > } > > if (((count > 0) && (count % 5000 == 0)) || ((countErr > 0) && (countErr % > 500 == 0))) { > System.out.print("\r"+count+" rows processed, "+countErr+" erroneous > lines."); > } > } > System.out.println("\r"+count+" rows processed, "+countErr+" erroneous > lines."); > in.close(); > reader.close(); > writer.close(); > > File sortedFile = FUtils.sortFile(tmpFile); > > System.out.println("Unifying nodes..."); > nodesFile = File.createTempFile("qsa-nodes", ".txt"); > writer = new BufferedWriter(new FileWriter(nodesFile)); > in = FUtils.getInputStream(sortedFile); > reader = new BufferedReader(new InputStreamReader(in)); > line = null; > count = 0; > String lastDoc = "-1"; > String lastLine = ""; > while ((line = reader.readLine()) != null) { > String doc = line.substring(0, line.indexOf("|")); > if (!doc.equals(lastDoc) && !lastDoc.equals("-1")) { > writer.write(lastLine+"\n"); > } > lastDoc = doc; 
> lastLine = line; > count++; > if ((count > 0) && (count % 5000 == 0)) { > System.out.print("\r"+count+" rows processed."); > } > } > writer.write(lastLine+"\n"); > System.out.println("\r"+count+" rows processed."); > in.close(); > reader.close(); > writer.close(); > } catch (IOException e) { > e.printStackTrace(); > } > > return nodesFile; > } > > private void addPerson(final String doc, final String name) { > PersonType tipo = (doc.length() <= 11) ? PersonType.INDIVIDUAL : > PersonType.LEGAL; > > Map<String, Object> pessoaProperties = new HashMap<String, Object>(); > pessoaProperties.put(Person.KEY_DOC , doc); > pessoaProperties.put(Person.KEY_NAME, name); > pessoaProperties.put(Person.KEY_TYPE, tipo.toString()); > > Map<String, Object> indexInfo = new HashMap<String, Object>(); > indexInfo.put(Person.KEY_DOC, doc); > > index.add(inserter.createNode(pessoaProperties), indexInfo); > tipo = null; > pessoaProperties = null; > indexInfo = null; > } > > private void addSociety(final String srcDoc, final String destDoc, final > long entryDate, > final String qualification, final double participation) { > Person source = null; > Person destination = null; > try { > IndexHits<Long> hits = index.get(Person.KEY_DOC, srcDoc); > source = new Person(dbService.getNodeById(hits.getSingle())); > hits = index.get(Person.KEY_DOC, destDoc); > destination = new Person(dbService.getNodeById(hits.getSingle())); > > CorporateRelationship sociedade = source.getSociety(destination); > if (sociedade == null) { > sociedade = source.addSociety(destination, qualification, > participation, entryDate); > } else { > sociedade.setQualification(qualification); > sociedade.setParticipation(participation); > sociedade.setEntryDate(entryDate); > } > } catch (Exception e) { > System.err.println("Error creating society between '"+srcDoc+"' and > '"+destDoc+"'."); > System.err.println("Source : "+source); > System.err.println("Destination: "+destination); > System.err.println(e.getMessage()); > 
System.err.println("---"); > } > } > > public void createNodes(final File nodesFile) { > System.out.println("Creating nodes..."); > int count = 0; > InputStream in = FUtils.getInputStream(nodesFile); > BufferedReader reader = new BufferedReader(new InputStreamReader(in)); > String line = null; > try { > while ((line = reader.readLine()) != null) { > int i = line.indexOf("|"); > if (i != -1) { > String doc = line.substring(0, i); > String name = line.substring(i+1); > addPerson(doc, name); > doc = null; > name = null; > count++; > } else { > System.err.println("ERROR: invalid line '"+line+"'"); > } > > if (count % 5000 == 0) { > System.out.print("\r"+count+" added nodes."); > } > } > System.out.println("\r"+count+" added nodes."); > } catch (IOException e) { > e.printStackTrace(); > } > } > > public void createRelationships(final File relationshipsFile) { > System.out.println("Creating edges..."); > int count = 0; > int countErr = 0; > InputStream in = FUtils.getInputStream(relationshipsFile); > BufferedReader reader = new BufferedReader(new InputStreamReader(in)); > String line = null; > try { > while ((line = reader.readLine()) != null) {Matcher matcher = > PTRN_LINE.matcher(line); > if (matcher.matches()) { > String srcDoc = matcher.group(GRP_SRC_DOC); > String destDoc = matcher.group(GRP_DEST_DOC); > long entryDate = > Long.parseLong(DATE_FORMATTER.format(DATE_PARSER.parse(matcher.group(GRP_ENTRY_DATE)))); > String qualification = matcher.group(GRP_QUAL); > double participation = > Double.parseDouble(matcher.group(GRP_PART_INT)) / 100 + > > Double.parseDouble(matcher.group(GRP_PART_DEC)) / 10000; > if (!srcDoc.equals(destDoc)) { > addSociety(srcDoc, destDoc, entryDate, qualification, > participation); > count++; > } else { > System.err.println("ERROR: invalid society."); > System.err.println("linha: '"+line+"'"); > System.err.println("---"); > countErr++; > } > } else { > System.err.println("ERROR: the line '"+line+"' doesn't match the pattern."); > 
System.err.println("---"); > countErr++; > } > > if (((count > 0) && (count % 5000 == 0)) || ((countErr > 0) && (countErr % > 500 == 0))) { > System.out.print("\r"+count+" edges added, "+countErr+" invalid > societies."); > } > } > System.out.println("\r"+count+" edges added, "+countErr+" invalid > societies."); > } catch (IOException e) { > e.printStackTrace(); > } catch (NumberFormatException e) { > e.printStackTrace(); > } catch (ParseException e) { > e.printStackTrace(); > } > } > > public void updateDB(final File file) { > InputStream in = FUtils.getInputStream(file); > BufferedReader reader = new BufferedReader(new InputStreamReader(in)); > String line = null; > try { > while ((line = reader.readLine()) != null) { > Matcher matcher = PTRN_LINE.matcher(line); > if (matcher.matches()) { > String srcDoc = matcher.group(GRP_SRC_DOC); > String srcName = matcher.group(GRP_SRC_NAME); > IndexHits<Long> srcNode = index.get(Person.KEY_DOC, srcDoc); > > String destDoc = matcher.group(GRP_DEST_DOC); > String destName = matcher.group(GRP_DEST_NAME); > > long entryDate = > Long.parseLong(DATE_FORMATTER.format(DATE_PARSER.parse(matcher.group(GRP_ENTRY_DATE)))); > String qualification = matcher.group(GRP_QUAL); > double participation = > Double.parseDouble(matcher.group(GRP_PART_INT)) / 100 + > > Double.parseDouble(matcher.group(GRP_PART_DEC)) / 10000; > } else { > System.err.println("ERRO: the line '"+line+"' doesn't match the pattern."); > System.err.println("---"); > } > } > } catch (IOException e) { > e.printStackTrace(); > } catch (NumberFormatException e) { > // TODO Auto-generated catch block > e.printStackTrace(); > } catch (ParseException e) { > // TODO Auto-generated catch block > e.printStackTrace(); > } > } > > public void createDB(final File initialFile) { > File nodesFile = prepareNodesFile(initialFile); > createNodes(nodesFile); > index.flush(); > createRelationships(initialFile); > } > } > _______________________________________________ > Neo4j mailing list > 
[email protected] > https://lists.neo4j.org/mailman/listinfo/user _______________________________________________ Neo4j mailing list [email protected] https://lists.neo4j.org/mailman/listinfo/user

