
# coding: utf-8

# In[37]:


import pandas as pd
import os, glob, tqdm


# Parsing excel files and merging them into a combined dataframe

# In[74]:


dataframes = []
for file in tqdm.tqdm(glob.glob('Microsoft_corpus_*')):
    df = pd.read_excel(file)
    if (df.shape[1] != 2):
        print("Fix number of colums to 2 in file " + file + ". Currently it has " + str(df.shape[1]) + ". Ignoring it.")
    else:
        df = pd.read_excel(file, names=['L1', 'L2'])
        #df = pd.DataFrame(df, dtype = str, columns = ["L1", "L2"])
        dataframes.append(df)


# In[75]:


combined_dataframe = dataframes[0]
for d in dataframes[1:]:
    combined_dataframe = combined_dataframe.append(d)
    print(d.shape, combined_dataframe.shape)
combined_dataframe.shape


# In[78]:


sum([x.shape[0] for x in dataframes])


# In[79]:


dedup_frame = combined_dataframe.drop_duplicates()


# In[80]:


dedup_frame.shape


# In[81]:


print([(x,y) for x, y in zip([x.shape[1] for x in dataframes], glob.glob('Microsoft_corpus_*')) if x != 2])


# In[85]:


dedup_frame.to_csv('Combined_excel.csv')


# In[87]:


print("Total Dupes:", combined_dataframe.shape[0] - dedup_frame.shape[0])

