In my last blog post I discussed using Map/Reduce to find co-authors in PubMed data on HDP for Windows. In this blog I will explain how to extract keywords from PubMed abstracts. I am going to use the API provided by BjutCS on CodeProject, which extracts keywords based on entropy difference. For more details you can check the article on CodeProject.
I downloaded the PubMed data using the NCBI Entrez Utilities Web Service. I am going to use only three fields — ID, Title and Abstract — for extracting keywords, so I stored this information in a tab-delimited text file. Below is a screenshot of the file opened in MS Excel.
Before writing the code you have to import the file into HDFS and install the Microsoft .NET Map Reduce API for Hadoop for Visual Studio. You can check my last blog post for instructions on setting up the development environment.
Mapper
In the map phase we read each record from the file. We have to extract keywords from the abstract text, so we have to format it properly first. I have used a helper class named KeyWordExtractorUtility. Its method StandardiseText standardizes the text by removing punctuation, and another method called RemoveStopWords removes the stop words from the text. The stop words are stored in a file; you can add new stop words to extract more accurate keywords. Then we emit a key/value pair for each word in the abstract, where the key is (id + word + total word count of the abstract) and the value is the position of the word in the abstract. The id and the total word count in the key will be needed in the reduce phase.
Below is the code
/// <summary>
/// Mapper: turns one tab-delimited PubMed record (id, title, abstract) into
/// one key/value pair per non-stop word.
/// Key = "id,word,totalWordCount"; value = the word's position in the text.
/// The id and total word count ride along in the key because the
/// reducer/combiner needs them to compute the entropy difference.
/// </summary>
public class WordsMap : MapperBase
{
    public override void Map(string inputLine, MapperContext context)
    {
        string[] pubmedData = inputLine.Split('\t');
        string id = pubmedData[0];
        string title = pubmedData[1];
        string abstractText = pubmedData[2];
        string text = title + " " + abstractText;
        text = KeyWordExtractorUtility.StandardiseText(text);
        string[] words = KeyWordExtractorUtility.RemoveStopWords(text);
        // Hoisted out of the loop: the original called the LINQ extension
        // words.Count() in the loop condition and again inside the body on
        // every iteration; Length on an array is the idiomatic (and cheap) form.
        int totalWords = words.Length;
        for (int i = 0; i < totalWords; i++)
        {
            context.EmitKeyValue(id + "," + words[i] + "," + totalWords, i.ToString());
        }
    }
}
Combiner/Intermediate Reducer
Here we use the collected position values for a word to calculate the entropy difference. The method CheckEntropy is a wrapper around the KeywordExtractionAPI; it initializes and assigns the required values to calculate the entropy difference, and returns true if the entropy difference for the word is non-negative. Here we emit the id as the key and the selected word as the value.
/// <summary>
/// Combiner/intermediate reducer: receives all positions of one word within
/// one abstract (key = "id,word,totalWordCount", values = positions) and
/// keeps the word only if the entropy-difference test says it is a keyword.
/// Emits id -> word.
/// </summary>
public class KeyWordReducer1 : ReducerCombinerBase
{
    public override void Reduce(string key, IEnumerable<string> values, ReducerCombinerContext context)
    {
        // Unpack the composite key built by the mapper.
        string[] keyVals = key.Split(',');
        string id = keyVals[0];
        string word = keyVals[1];
        int totalWords = Convert.ToInt32(keyVals[2]);
        bool isKeyword = KeyWordExtractorUtility.CheckEntropy(values.ToArray(), totalWords);
        if (isKeyword)
        {
            context.EmitKeyValue(id, word);
        }
    }
}
Reducer
Here we collect the words for each unique id. I have used the id to create hyperlinks that navigate to the published PubMed article at http://www.ncbi.nlm.nih.gov/pubmed. Finally the URL and the keywords are emitted.
/// <summary>
/// Final reducer: for each PubMed id, emits the article URL as the key and
/// the comma-separated keyword list as the value.
/// </summary>
public class KeyWordReducer2 : ReducerCombinerBase
{
    public override void Reduce(string key, IEnumerable<string> values, ReducerCombinerContext context)
    {
        // string.Join is clearer than Aggregate and, unlike Aggregate,
        // does not throw on an empty sequence.
        context.EmitKeyValue("http://www.ncbi.nlm.nih.gov/pubmed/" + key, string.Join(", ", values));
    }
}
Job Definition
Below is the code for the job. Set the input path according to your data in hdfs.
/// <summary>
/// Job definition: wires the mapper, the combiner/intermediate reducer and
/// the final reducer together and points the job at its HDFS folders.
/// </summary>
public class FindKeywordsJob : HadoopJob<WordsMap, KeyWordReducer1, KeyWordReducer2>
{
    public override HadoopJobConfiguration Configure(ExecutorContext context)
    {
        var config = new HadoopJobConfiguration();
        config.InputPath = "input/kw";    // adjust to where your data lives in HDFS
        config.OutputFolder = "output/kw";
        return config;
    }
}
Run the Job
Below is the code to run the job.
static void Main(string[] args)
{
    // Connect to the Hadoop cluster and run the keyword-extraction job.
    var cluster = Hadoop.Connect();
    var result = cluster.MapReduceJob.ExecuteJob<FindKeywordsJob>();
    // Keep the console window open until a key is pressed.
    Console.ReadKey();
}
If the Map/Reduce job runs successfully, the output file can be viewed as below in MS Excel.
Below is the complete code
using System;using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Xml;
using Microsoft.Hadoop.MapReduce;
namespace keyword
{
    class Program
    {
        /// <summary>
        /// Mapper: turns one tab-delimited PubMed record (id, title, abstract)
        /// into one key/value pair per non-stop word.
        /// Key = "id,word,totalWordCount"; value = the word's position.
        /// </summary>
        public class WordsMap : MapperBase
        {
            public override void Map(string inputLine, MapperContext context)
            {
                string[] pubmedData = inputLine.Split('\t');
                string id = pubmedData[0];
                string title = pubmedData[1];
                string abstractText = pubmedData[2];
                string text = title + " " + abstractText;
                text = KeyWordExtractorUtility.StandardiseText(text);
                string[] words = KeyWordExtractorUtility.RemoveStopWords(text);
                // Hoisted: the original re-evaluated the LINQ words.Count()
                // in the loop condition and body on every iteration.
                int totalWords = words.Length;
                for (int i = 0; i < totalWords; i++)
                {
                    context.EmitKeyValue(id + "," + words[i] + "," + totalWords, i.ToString());
                }
            }
        }

        /// <summary>
        /// Combiner/intermediate reducer: keeps a word only when the
        /// entropy-difference test marks it as a keyword. Emits id -> word.
        /// </summary>
        public class KeyWordReducer1 : ReducerCombinerBase
        {
            public override void Reduce(string key, IEnumerable<string> values, ReducerCombinerContext context)
            {
                // Unpack the composite key built by the mapper.
                string[] keyVals = key.Split(',');
                string id = keyVals[0];
                string word = keyVals[1];
                int totalWords = Convert.ToInt32(keyVals[2]);
                bool isKeyword = KeyWordExtractorUtility.CheckEntropy(values.ToArray(), totalWords);
                if (isKeyword)
                {
                    context.EmitKeyValue(id, word);
                }
            }
        }

        /// <summary>
        /// Final reducer: emits the PubMed article URL for each id together
        /// with the comma-separated keyword list.
        /// </summary>
        public class KeyWordReducer2 : ReducerCombinerBase
        {
            public override void Reduce(string key, IEnumerable<string> values, ReducerCombinerContext context)
            {
                // string.Join is clearer than Aggregate and does not throw
                // on an empty sequence.
                context.EmitKeyValue("http://www.ncbi.nlm.nih.gov/pubmed/" + key, string.Join(", ", values));
            }
        }

        /// <summary>
        /// Job definition: wires mapper, combiner and reducer together and
        /// points the job at its HDFS folders.
        /// </summary>
        public class FindKeywordsJob : HadoopJob<WordsMap, KeyWordReducer1, KeyWordReducer2>
        {
            public override HadoopJobConfiguration Configure(ExecutorContext context)
            {
                var config = new HadoopJobConfiguration();
                config.InputPath = "input/kw";    // adjust to your data in HDFS
                config.OutputFolder = "output/kw";
                return config;
            }
        }

        static void Main(string[] args)
        {
            // Connect to the cluster, run the job, then keep the console open.
            var hadoop = Hadoop.Connect();
            var result = hadoop.MapReduceJob.ExecuteJob<FindKeywordsJob>();
            Console.ReadKey();
        }
    }
}
KeyWordExtractorUtility
using System;using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
namespace keyword
{
    /// <summary>
    /// Helper routines for keyword extraction: text normalisation, stop-word
    /// removal, and a wrapper around the CodeProject KeywordExtractionAPI
    /// entropy-difference test.
    /// </summary>
    public class KeyWordExtractorUtility
    {
        static string path = @"C:\temp\stoplist.txt"; // your stop-list file path

        // Stop words loaded once in the static constructor. A HashSet gives
        // O(1) membership checks; the original scanned a string[] with
        // Contains (O(n)) for every word of every abstract.
        static HashSet<string> stopWords;

        static KeyWordExtractorUtility()
        {
            // GB2312 matches the encoding of the stop-list file that ships
            // with the CodeProject API. NOTE(review): on .NET Core/.NET 5+
            // this code page needs System.Text.Encoding.CodePages — confirm
            // the target framework before porting.
            Encoding encoding = Encoding.GetEncoding("GB2312");
            stopWords = new HashSet<string>(File.ReadAllLines(path, encoding));
        }

        /// <summary>
        /// Normalises free text: collapses newlines and runs of whitespace,
        /// strips punctuation (ASCII and full-width forms), and lower-cases.
        /// </summary>
        public static string StandardiseText(string input)
        {
            Regex regex = new Regex("\\s{2,}", RegexOptions.IgnoreCase);
            input = Regex.Replace(input, "\n", " ");
            input = regex.Replace(input, " ").Trim();
            // Punctuation class includes both full-width and ASCII forms.
            // NOTE(review): reconstructed from a smart-quote-mangled source —
            // verify the exact character set against the original article.
            input = Regex.Replace(input, "[,:;!()(),.:\"";&#?!]", " ");
            input = regex.Replace(input, " ").Trim();
            input = input.ToLower();
            return input;
        }

        /// <summary>
        /// Splits the document on spaces and drops every word found in the
        /// stop list; returns the surviving words in order.
        /// </summary>
        public static string[] RemoveStopWords(string TheDoc)
        {
            string[] wordArray = TheDoc.Split(new char[] { ' ' });
            List<string> resultWords = new List<string>();
            for (int i = 0; i < wordArray.Length; i++)
            {
                if (!stopWords.Contains(wordArray[i]))
                {
                    resultWords.Add(wordArray[i]);
                }
            }
            return resultWords.ToArray();
        }

        /// <summary>
        /// Wraps the KeywordExtractionAPI entropy-difference test.
        /// <paramref name="positions"/> holds the (stringified) positions of
        /// one word within an abstract of <paramref name="totalWords"/> words.
        /// Returns true when the normalised entropy difference is positive.
        /// </summary>
        public static bool CheckEntropy(string[] positions, int totalWords)
        {
            int frequency = positions.Length;
            int[] positionsArray = new int[frequency];
            int[] distanceArray = new int[frequency];
            for (int i = 0; i < frequency; i++)
            {
                positionsArray[i] = Convert.ToInt32(positions[i]);
            }
            for (int k = 0; k < frequency; k++)
            {
                if (k == 0)
                {
                    // Wrap-around distance: from the last occurrence, past the
                    // end of the document, back to the first occurrence.
                    // (The original text contained an en-dash here instead of
                    // a minus sign, which does not compile.)
                    distanceArray[k] = (positionsArray[k] + totalWords) - positionsArray[frequency - 1];
                }
                else
                {
                    distanceArray[k] = positionsArray[k] - positionsArray[k - 1];
                }
            }
            KeywordExtractionAPI.WORDSFRE keyWordExtract = new KeywordExtractionAPI.WORDSFRE();
            keyWordExtract.Distance = distanceArray;
            keyWordExtract.Position = positionsArray;
            keyWordExtract.Frequency = frequency;
            if (keyWordExtract.EntropyDifference_Normal())
            {
                if (keyWordExtract.ED > 0)
                {
                    return true;
                }
            }
            return false;
        }
    }
}
Note: You have to use the KeywordExtractionAPI.dll from CodeProject. Create a text file stoplist.txt and add stop words, one per line. The code uses this file to remove stop words.