Re: [Neo4j] Memory overflow while creating big graph

Peter Neubauer Tue, 16 Aug 2011 22:28:42 -0700

Joe,
Do you have access to a profiler like VisualVM? It could be that the regexp
is not scaling - I have seen this in my SQL importer project. Just a
thought; it would be great if you could measure where the slowdown occurs.


/peter

Sent from my phone.
On Aug 16, 2011 11:09 PM, "Jose Vinicius Pimenta Coletto" <
[email protected]> wrote:
> Sorry, the source code follows:
>
> public class InitialDBCreator {
> /** Parses the {@code dd/MM/yyyy} dates captured from an input line. */
> private static final SimpleDateFormat DATE_PARSER =
>         new SimpleDateFormat("dd/MM/yyyy");
> /** Renders a date as {@code yyyyMMdd}; the result is parsed into a long. */
> private static final SimpleDateFormat DATE_FORMATTER =
>         new SimpleDateFormat("yyyyMMdd");
>
> /**
>  * One tab-separated input line, one capture group per fragment below.
>  * Groups 4 and 10 are captured but have no named index constant.
>  * NOTE(review): GRP_SRC_NAME points at group 5, not group 4 - confirm
>  * against the actual column layout of the input file.
>  */
> private static final Pattern PTRN_LINE = Pattern.compile(
>         "(\\d{11,14})\\t"                   // 1: GRP_DEST_DOC
>         + "([^\\t]+)\\t"                    // 2: GRP_DEST_NAME
>         + "(\\d{11,14})\\t"                 // 3: GRP_SRC_DOC
>         + "([^\\t]+)\\t"                    // 4: (unnamed)
>         + "([^\\t]+)\\t"                    // 5: GRP_SRC_NAME
>         + "([^\\t]+)\\t"                    // 6: GRP_QUAL
>         + "(\\d{2}/\\d{2}/\\d{4})\\t"       // 7: GRP_ENTRY_DATE
>         + "(\\d{1,3}),(\\d{2})%\\t"         // 8, 9: GRP_PART_INT, GRP_PART_DEC
>         + "(\\d{2}/\\d{2}/\\d{4})");        // 10: (unnamed)
>
> // Capture-group indexes into PTRN_LINE.
> private static final int GRP_DEST_DOC = 1;
> private static final int GRP_DEST_NAME = 2;
> private static final int GRP_SRC_DOC = 3;
> private static final int GRP_SRC_NAME = 5;
> private static final int GRP_QUAL = 6;
> private static final int GRP_ENTRY_DATE = 7;
> private static final int GRP_PART_INT = 8;
> private static final int GRP_PART_DEC = 9;
>
> /** Batch inserter used to create nodes. */
> private final BatchInserter inserter;
> /** Graph database view obtained from the batch inserter. */
> private final GraphDatabaseService dbService;
> /** Provider that owns the batch index. */
> private final BatchInserterIndexProvider indexProvider;
> /** Node index keyed by document number. */
> private final BatchInserterIndex index;
>
> public InitialDBCreator(final String storeDir, final Map<String, String>
> config, final String indexName) {
> System.out.println("Iniciando inserter...");
> inserter = new BatchInserterImpl(storeDir, config);
> dbService = inserter.getGraphDbService();
> System.out.println("Iniciando indexProvider...");
> indexProvider = new LuceneBatchInserterIndexProvider(inserter);
> System.out.println("Iniciando index...");
> index = indexProvider.nodeIndex(indexName, MapUtil.stringMap("type",
> "exact"));
> System.out.println("DB iniciado!");
> Runtime.getRuntime().addShutdownHook(
> new Thread() {
> @Override
> public void run() {
> indexProvider.shutdown();
> inserter.shutdown();
> }
> });
> }
>
> /**
>  * Flushes the index, then shuts down the index provider and the inserter.
>  * Each later step still runs when an earlier one throws, so a failed flush
>  * no longer leaves the inserter open; the exception is still propagated.
>  */
> public void shutdown() {
>     try {
>         index.flush();
>     } finally {
>         try {
>             indexProvider.shutdown();
>         } finally {
>             inserter.shutdown();
>         }
>     }
> }
>
> /**
>  * Builds a temporary file with one {@code doc|name} line per distinct
>  * document number found in {@code initialFile}.
>  *
>  * Steps: extract a {@code doc|name} line for both parties of every matching
>  * input line, pass that file through {@code FUtils.sortFile} (presumably a
>  * line sort that brings equal documents together - confirm), then keep the
>  * last line of each run of equal documents.
>  *
>  * @param initialFile raw input file
>  * @return the nodes file; {@code null} if an I/O error happened before it was
>  *         created (after that point the file may be incomplete)
>  */
> private File prepareNodesFile(final File initialFile) {
>     File nodesFile = null;
>     try {
>         System.out.println("Extracting nodes...");
>         File tmpFile = File.createTempFile("qsa-tempnodes", ".txt");
>         extractNodes(initialFile, tmpFile);
>
>         File sortedFile = FUtils.sortFile(tmpFile);
>
>         System.out.println("Unifying nodes...");
>         nodesFile = File.createTempFile("qsa-nodes", ".txt");
>         unifyNodes(sortedFile, nodesFile);
>     } catch (IOException e) {
>         e.printStackTrace();
>     }
>     return nodesFile;
> }
>
> /** Writes a doc|name line to target for both parties of each matching line. */
> private void extractNodes(final File initialFile, final File target)
>         throws IOException {
>     int count = 0;
>     int countErr = 0;
>     BufferedWriter writer = new BufferedWriter(new FileWriter(target));
>     try {
>         BufferedReader reader = new BufferedReader(
>                 new InputStreamReader(FUtils.getInputStream(initialFile)));
>         try {
>             String line;
>             while ((line = reader.readLine()) != null) {
>                 Matcher matcher = PTRN_LINE.matcher(line);
>                 if (matcher.matches()) {
>                     String docOne = matcher.group(GRP_SRC_DOC);
>                     String nameOne = matcher.group(GRP_SRC_NAME);
>                     if (!docOne.equals("") && !nameOne.equals("")) {
>                         writer.write(docOne + "|" + nameOne + "\n");
>                     }
>
>                     String docTwo = matcher.group(GRP_DEST_DOC);
>                     String nameTwo = matcher.group(GRP_DEST_NAME);
>                     if (!docTwo.equals("") && !nameTwo.equals("")) {
>                         writer.write(docTwo + "|" + nameTwo + "\n");
>                     }
>                     count++;
>                 } else {
>                     System.err.println("ERRO: the line '" + line
>                             + "' doesn't match the pattern.");
>                     System.err.println("---");
>                     countErr++;
>                 }
>
>                 if (((count > 0) && (count % 5000 == 0))
>                         || ((countErr > 0) && (countErr % 500 == 0))) {
>                     System.out.print("\r" + count + " rows processed, "
>                             + countErr + " erroneous lines.");
>                 }
>             }
>             System.out.println("\r" + count + " rows processed, "
>                     + countErr + " erroneous lines.");
>         } finally {
>             // Closing the reader also closes the underlying input stream.
>             reader.close();
>         }
>     } finally {
>         writer.close();
>     }
> }
>
> /** Copies the last line of every run of equal documents to target. */
> private void unifyNodes(final File sortedFile, final File target)
>         throws IOException {
>     int count = 0;
>     BufferedWriter writer = new BufferedWriter(new FileWriter(target));
>     try {
>         BufferedReader reader = new BufferedReader(
>                 new InputStreamReader(FUtils.getInputStream(sortedFile)));
>         try {
>             String lastDoc = null;
>             String lastLine = null;
>             String line;
>             while ((line = reader.readLine()) != null) {
>                 int sep = line.indexOf("|");
>                 if (sep == -1) {
>                     // Without this guard substring(0, -1) would throw.
>                     System.err.println("ERROR: invalid line '" + line + "'");
>                     continue;
>                 }
>                 String doc = line.substring(0, sep);
>                 if (lastDoc != null && !doc.equals(lastDoc)) {
>                     writer.write(lastLine + "\n");
>                 }
>                 lastDoc = doc;
>                 lastLine = line;
>                 count++;
>                 if (count % 5000 == 0) {
>                     System.out.print("\r" + count + " rows processed.");
>                 }
>             }
>             // Flush the final run; on empty input there is nothing to write,
>             // so no blank line ends up in the nodes file.
>             if (lastLine != null) {
>                 writer.write(lastLine + "\n");
>             }
>             System.out.println("\r" + count + " rows processed.");
>         } finally {
>             reader.close();
>         }
>     } finally {
>         writer.close();
>     }
> }
>
> private void addPerson(final String doc, final String name) {
> PersonType tipo = (doc.length() <= 11) ? PersonType.INDIVIDUAL :
> PersonType.LEGAL;
>
> Map<String, Object> pessoaProperties = new HashMap<String, Object>();
> pessoaProperties.put(Person.KEY_DOC , doc);
> pessoaProperties.put(Person.KEY_NAME, name);
> pessoaProperties.put(Person.KEY_TYPE, tipo.toString());
>
> Map<String, Object> indexInfo = new HashMap<String, Object>();
> indexInfo.put(Person.KEY_DOC, doc);
>
> index.add(inserter.createNode(pessoaProperties), indexInfo);
> tipo = null;
> pessoaProperties = null;
> indexInfo = null;
> }
>
> private void addSociety(final String srcDoc, final String destDoc, final
> long entryDate,
> final String qualification, final double participation) {
> Person source = null;
> Person destination = null;
> try {
> IndexHits<Long> hits = index.get(Person.KEY_DOC, srcDoc);
> source = new Person(dbService.getNodeById(hits.getSingle()));
> hits = index.get(Person.KEY_DOC, destDoc);
> destination = new Person(dbService.getNodeById(hits.getSingle()));
>
> CorporateRelationship sociedade = source.getSociety(destination);
> if (sociedade == null) {
> sociedade = source.addSociety(destination, qualification,
> participation, entryDate);
> } else {
> sociedade.setQualification(qualification);
> sociedade.setParticipation(participation);
> sociedade.setEntryDate(entryDate);
> }
> } catch (Exception e) {
> System.err.println("Error creating society between '"+srcDoc+"' and
> '"+destDoc+"'.");
> System.err.println("Source : "+source);
> System.err.println("Destination: "+destination);
> System.err.println(e.getMessage());
> System.err.println("---");
> }
> }
>
> public void createNodes(final File nodesFile) {
> System.out.println("Creating nodes...");
> int count = 0;
> InputStream in = FUtils.getInputStream(nodesFile);
> BufferedReader reader = new BufferedReader(new InputStreamReader(in));
> String line = null;
> try {
> while ((line = reader.readLine()) != null) {
> int i = line.indexOf("|");
> if (i != -1) {
> String doc = line.substring(0, i);
> String name = line.substring(i+1);
> addPerson(doc, name);
> doc = null;
> name = null;
> count++;
> } else {
> System.err.println("ERROR: invalid line '"+line+"'");
> }
>
> if (count % 5000 == 0) {
> System.out.print("\r"+count+" added nodes.");
> }
> }
> System.out.println("\r"+count+" added nodes.");
> } catch (IOException e) {
> e.printStackTrace();
> }
> }
>
> public void createRelationships(final File relationshipsFile) {
> System.out.println("Creating edges...");
> int count = 0;
> int countErr = 0;
> InputStream in = FUtils.getInputStream(relationshipsFile);
> BufferedReader reader = new BufferedReader(new InputStreamReader(in));
> String line = null;
> try {
> while ((line = reader.readLine()) != null) {Matcher matcher =
> PTRN_LINE.matcher(line);
> if (matcher.matches()) {
> String srcDoc = matcher.group(GRP_SRC_DOC);
> String destDoc = matcher.group(GRP_DEST_DOC);
> long entryDate =
>
Long.parseLong(DATE_FORMATTER.format(DATE_PARSER.parse(matcher.group(GRP_ENTRY_DATE))));
> String qualification = matcher.group(GRP_QUAL);
> double participation =
> Double.parseDouble(matcher.group(GRP_PART_INT)) / 100 +
>
> Double.parseDouble(matcher.group(GRP_PART_DEC)) / 10000;
> if (!srcDoc.equals(destDoc)) {
> addSociety(srcDoc, destDoc, entryDate, qualification,
> participation);
> count++;
> } else {
> System.err.println("ERROR: invalid society.");
> System.err.println("linha: '"+line+"'");
> System.err.println("---");
> countErr++;
> }
> } else {
> System.err.println("ERROR: the line '"+line+"' doesn't match the
pattern.");
> System.err.println("---");
> countErr++;
> }
>
> if (((count > 0) && (count % 5000 == 0)) || ((countErr > 0) && (countErr %
> 500 == 0))) {
> System.out.print("\r"+count+" edges added, "+countErr+" invalid
> societies.");
> }
> }
> System.out.println("\r"+count+" edges added, "+countErr+" invalid
> societies.");
> } catch (IOException e) {
> e.printStackTrace();
> } catch (NumberFormatException e) {
> e.printStackTrace();
> } catch (ParseException e) {
> e.printStackTrace();
> }
> }
>
> /**
>  * Reads {@code file} line by line, looks up the source document in the index
>  * and parses the remaining fields of every matching line.
>  * NOTE(review): the parsed values and the index hits are not applied to the
>  * database yet - this method currently only validates the input.
>  * An {@link IOException}, {@link ParseException} or
>  * {@link NumberFormatException} ends processing and its stack trace is
>  * printed; the reader and the index hits are closed in every case.
>  *
>  * @param file tab-separated input file
>  */
> public void updateDB(final File file) {
>     BufferedReader reader = new BufferedReader(
>             new InputStreamReader(FUtils.getInputStream(file)));
>     try {
>         try {
>             String line;
>             while ((line = reader.readLine()) != null) {
>                 Matcher matcher = PTRN_LINE.matcher(line);
>                 if (matcher.matches()) {
>                     String srcDoc = matcher.group(GRP_SRC_DOC);
>                     String srcName = matcher.group(GRP_SRC_NAME);
>                     IndexHits<Long> srcNode = index.get(Person.KEY_DOC, srcDoc);
>                     try {
>                         String destDoc = matcher.group(GRP_DEST_DOC);
>                         String destName = matcher.group(GRP_DEST_NAME);
>
>                         // dd/MM/yyyy text -> yyyyMMdd digits -> long
>                         long entryDate = Long.parseLong(DATE_FORMATTER.format(
>                                 DATE_PARSER.parse(matcher.group(GRP_ENTRY_DATE))));
>                         String qualification = matcher.group(GRP_QUAL);
>                         // "NN,DD%" -> fraction: NN/100 + DD/10000
>                         double participation =
>                                 Double.parseDouble(matcher.group(GRP_PART_INT)) / 100
>                                 + Double.parseDouble(matcher.group(GRP_PART_DEC)) / 10000;
>                     } finally {
>                         // The hits are never iterated, so release them here.
>                         srcNode.close();
>                     }
>                 } else {
>                     System.err.println("ERRO: the line '" + line
>                             + "' doesn't match the pattern.");
>                     System.err.println("---");
>                 }
>             }
>         } finally {
>             // Closing the reader also closes the underlying input stream.
>             reader.close();
>         }
>     } catch (IOException e) {
>         e.printStackTrace();
>     } catch (NumberFormatException e) {
>         e.printStackTrace();
>     } catch (ParseException e) {
>         e.printStackTrace();
>     }
> }
>
> public void createDB(final File initialFile) {
> File nodesFile = prepareNodesFile(initialFile);
> createNodes(nodesFile);
> index.flush();
> createRelationships(initialFile);
> }
> }
> _______________________________________________
> Neo4j mailing list
> [email protected]
> https://lists.neo4j.org/mailman/listinfo/user
_______________________________________________
Neo4j mailing list
[email protected]
https://lists.neo4j.org/mailman/listinfo/user

Re: [Neo4j] Memory overflow while creating big graph

Reply via email to