New reply on DataCleaner's online discussion forum 
(http://datacleaner.org/forum):

djwizzdom replied to subject 'Duplicate Detection never finds any duplicates'

-------------------

Tried this twice now with the same results. First time did a lot of training, 
basically every single pair it suggested to me I marked as unique or duplicate. 
Entire record set is ~14,000 records.

Next time I only spent 5 minutes or so training it, not as many uniques but 
enough duplicates to get into the good bracket, and had the same "no duplicates 
found" result again.

Here is the content of the model in case that helps:

{{{
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>

-<deduplication-model version="1" threshold="0.545">


-<columns>

<column name="Account_Name" index="0"/>

<column name="Invoicing_City" index="1"/>

<column name="Invoicing_Country" index="2"/>

<column name="Invoicing_Province" index="3"/>

<column name="Invoicing_Address_1" index="4"/>

<column name="Invoicing_Address_2" index="5"/>

<column name="Invoicing_Address_3" index="6"/>

<column name="Web_Site" index="7"/>

</columns>


-<preselection-model>


-<subset>

<item column-index="0"/>

</subset>


-<subset>

<item column-index="4"/>

</subset>


-<subset>

<item column-index="1"/>

<item column-index="7"/>

</subset>


-<subset>

<item column-index="1"/>

<item column-index="3"/>

</subset>


-<subset>

<item column-index="3"/>

<item column-index="7"/>

</subset>


-<subset>

<item column-index="1"/>

<item column-index="6"/>

</subset>

</preselection-model>


-<matching-model>


-<rule fixed-score="false">

<condition threshold="0.960536398467433" column-index="0" 
operator="greater-than" feature-modifier="multiple-parts" 
feature="edit-distance"/>

<condition threshold="0.16666666666666669" column-index="3" 
operator="less-than" feature-modifier="empty-low" feature="edit-distance"/>

<condition threshold="0.7083333333333333" column-index="0" 
operator="greater-than" feature-modifier="multiple-parts" feature="phonetic"/>

<condition threshold="0.5" column-index="3" operator="greater-than" 
feature-modifier="unmodified" feature="exact"/>

</rule>


-<rule fixed-score="true" score="0.0">

<condition threshold="0.960536398467433" column-index="0" operator="less-than" 
feature-modifier="multiple-parts" feature="edit-distance"/>

<condition threshold="0.96875" column-index="4" operator="less-than" 
feature-modifier="multiple-parts" feature="exact"/>

<condition threshold="0.5" column-index="7" operator="less-than" 
feature-modifier="empty-low" feature="exact"/>

</rule>


-<rule fixed-score="false">

<condition threshold="0.960536398467433" column-index="0" operator="less-than" 
feature-modifier="multiple-parts" feature="edit-distance"/>

<condition threshold="0.96875" column-index="4" operator="greater-than" 
feature-modifier="multiple-parts" feature="exact"/>

<condition threshold="0.9510416666666667" column-index="7" 
operator="greater-than" feature-modifier="empty-high" feature="edit-distance"/>

</rule>


-<rule fixed-score="false">

<condition threshold="0.960536398467433" column-index="0" 
operator="greater-than" feature-modifier="multiple-parts" 
feature="edit-distance"/>

<condition threshold="0.16666666666666669" column-index="3" 
operator="less-than" feature-modifier="empty-low" feature="edit-distance"/>

<condition threshold="0.7083333333333333" column-index="0" 
operator="greater-than" feature-modifier="multiple-parts" feature="phonetic"/>

</rule>


-<rule fixed-score="false">

<condition threshold="0.960536398467433" column-index="0" 
operator="greater-than" feature-modifier="multiple-parts" 
feature="edit-distance"/>

<condition threshold="0.16666666666666669" column-index="3" 
operator="greater-than" feature-modifier="empty-low" feature="edit-distance"/>

<condition threshold="0.5277777777777778" column-index="4" operator="less-than" 
feature-modifier="multiple-parts" feature="exact"/>

</rule>


-<rule fixed-score="false">

<condition threshold="0.960536398467433" column-index="0" 
operator="greater-than" feature-modifier="multiple-parts" 
feature="edit-distance"/>

<condition threshold="0.16666666666666669" column-index="3" 
operator="greater-than" feature-modifier="empty-low" feature="edit-distance"/>

</rule>


-<rule fixed-score="false">

<condition threshold="0.960536398467433" column-index="0" 
operator="greater-than" feature-modifier="multiple-parts" 
feature="edit-distance"/>

<condition threshold="0.29166666666666663" column-index="4" 
operator="greater-than" feature-modifier="multiple-parts" feature="phonetic"/>

</rule>


-<rule fixed-score="true" score="0.0">

<condition threshold="0.960536398467433" column-index="0" operator="less-than" 
feature-modifier="multiple-parts" feature="edit-distance"/>

<condition threshold="0.96875" column-index="4" operator="greater-than" 
feature-modifier="multiple-parts" feature="exact"/>

</rule>


-<rule fixed-score="false">

<condition threshold="0.960536398467433" column-index="0" operator="less-than" 
feature-modifier="multiple-parts" feature="edit-distance"/>

</rule>

</matching-model>


-<scoring-model bias="-12.708377067484543">

<term column-index="0" feature-modifier="unmodified" feature="exact" 
weight="0.24687822521150035"/>

<term column-index="0" feature-modifier="empty-high" feature="exact" 
weight="0.06909461286974289"/>

<term column-index="0" feature-modifier="empty-low" feature="exact" 
weight="0.6693098514367335"/>

<term column-index="0" feature-modifier="multiple-parts" feature="exact" 
weight="0.49027617858588685"/>

<term column-index="0" feature-modifier="unmodified" feature="edit-distance" 
weight="-0.393977423875355"/>

<term column-index="0" feature-modifier="empty-high" feature="edit-distance" 
weight="0.8382673010958311"/>

<term column-index="0" feature-modifier="empty-low" feature="edit-distance" 
weight="0.2620273723143973"/>

<term column-index="0" feature-modifier="multiple-parts" 
feature="edit-distance" weight="5.3298091995844885"/>

<term column-index="0" feature-modifier="unmodified" feature="phonetic" 
weight="1.546195726647125"/>

<term column-index="0" feature-modifier="empty-high" feature="phonetic" 
weight="2.054248998741934"/>

<term column-index="0" feature-modifier="empty-low" feature="phonetic" 
weight="1.5676017522878867"/>

<term column-index="0" feature-modifier="multiple-parts" feature="phonetic" 
weight="3.3670475962386983"/>

<term column-index="3" feature-modifier="unmodified" feature="exact" 
weight="0.9383892669550286"/>

<term column-index="3" feature-modifier="empty-high" feature="exact" 
weight="-0.2808323750371145"/>

<term column-index="3" feature-modifier="empty-low" feature="exact" 
weight="-0.8538752684120915"/>

<term column-index="3" feature-modifier="multiple-parts" feature="exact" 
weight="-0.9052826318644718"/>

<term column-index="3" feature-modifier="unmodified" feature="edit-distance" 
weight="0.4598029505175806"/>

<term column-index="3" feature-modifier="empty-high" feature="edit-distance" 
weight="0.9119928315676423"/>

<term column-index="3" feature-modifier="empty-low" feature="edit-distance" 
weight="-0.04576096508200925"/>

<term column-index="3" feature-modifier="multiple-parts" 
feature="edit-distance" weight="0.11284105101995477"/>

<term column-index="3" feature-modifier="unmodified" feature="phonetic" 
weight="0.20941563285908674"/>

<term column-index="3" feature-modifier="empty-high" feature="phonetic" 
weight="-0.7113089521973885"/>

<term column-index="3" feature-modifier="empty-low" feature="phonetic" 
weight="1.2884482486809476"/>

<term column-index="3" feature-modifier="multiple-parts" feature="phonetic" 
weight="0.20345880850841677"/>

<term column-index="4" feature-modifier="unmodified" feature="exact" 
weight="1.9429488028007005"/>

<term column-index="4" feature-modifier="empty-high" feature="exact" 
weight="-2.1488874699447504"/>

<term column-index="4" feature-modifier="empty-low" feature="exact" 
weight="4.239860638771215"/>

<term column-index="4" feature-modifier="multiple-parts" feature="exact" 
weight="6.41400155345129"/>

<term column-index="4" feature-modifier="unmodified" feature="edit-distance" 
weight="0.7305529021980784"/>

<term column-index="4" feature-modifier="empty-high" feature="edit-distance" 
weight="-0.8434431318313502"/>

<term column-index="4" feature-modifier="empty-low" feature="edit-distance" 
weight="0.9497824163806728"/>

<term column-index="4" feature-modifier="multiple-parts" 
feature="edit-distance" weight="-7.497073140203304"/>

<term column-index="4" feature-modifier="unmodified" feature="phonetic" 
weight="-1.6006798608791546"/>

<term column-index="4" feature-modifier="empty-high" feature="phonetic" 
weight="-1.1779788253368457"/>

<term column-index="4" feature-modifier="empty-low" feature="phonetic" 
weight="1.1468433388762949"/>

<term column-index="4" feature-modifier="multiple-parts" feature="phonetic" 
weight="0.10606070884978461"/>

<term column-index="7" feature-modifier="unmodified" feature="exact" 
weight="-1.2971955922419627"/>

<term column-index="7" feature-modifier="empty-high" feature="exact" 
weight="0.8776221941303555"/>

<term column-index="7" feature-modifier="empty-low" feature="exact" 
weight="-0.04544815782110162"/>

<term column-index="7" feature-modifier="multiple-parts" feature="exact" 
weight="-0.19030345849865596"/>

<term column-index="7" feature-modifier="unmodified" feature="edit-distance" 
weight="0.7437183589830544"/>

<term column-index="7" feature-modifier="empty-high" feature="edit-distance" 
weight="3.0210727274350453"/>

<term column-index="7" feature-modifier="empty-low" feature="edit-distance" 
weight="-0.2908952575906266"/>

<term column-index="7" feature-modifier="multiple-parts" 
feature="edit-distance" weight="-1.1220109194414383"/>

<term column-index="7" feature-modifier="unmodified" feature="phonetic" 
weight="-0.5893521767726607"/>

<term column-index="7" feature-modifier="empty-high" feature="phonetic" 
weight="0.15572443988337026"/>

<term column-index="7" feature-modifier="empty-low" feature="phonetic" 
weight="5.998873570612469"/>

<term column-index="7" feature-modifier="multiple-parts" feature="phonetic" 
weight="1.9126207343098762"/>

</scoring-model>

</deduplication-model>
}}}

-------------------

View the topic online to reply - go to 
http://datacleaner.org/topic/1052/Duplicate-Detection-never-finds-any-duplicates

-- 
You received this message because you are subscribed to the Google Groups 
"DataCleaner-notify" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
To post to this group, send email to [email protected].
Visit this group at http://groups.google.com/group/datacleaner-notify.
For more options, visit https://groups.google.com/d/optout.

Reply via email to