Source Code for Using Lucene on HDFS

Original post · Hadoop · Author: 大秦here · 2014-04-12 14:41:16
// Building an index on the local (non-HDFS) filesystem
// Creating IndexWriter object and specifying the path where Indexed
//files are to be stored.
IndexWriter indexWriter = new IndexWriter("E://DataFile/IndexFiles", new StandardAnalyzer(), true);
             
// Creating BufferReader object and specifying the path of the file
//whose data is required to be indexed.
BufferedReader reader= new BufferedReader(new FileReader("E://DataFile/Test.txt"));
             
String row=null;
         
// Reading each line present in the file.
while ((row=reader.readLine())!= null)
{
// Getting each field of the row into an array; fields are space-separated.
String Arow[] = row.split(" ");
                 
// For each row, creating a document and adding data to the document with the associated fields.
org.apache.lucene.document.Document document = new org.apache.lucene.document.Document();
                 
document.add(new Field("date",Arow[0],Field.Store.YES,Field.Index.ANALYZED));
document.add(new Field("time",Arow[1],Field.Store.YES,Field.Index.ANALYZED));
document.add(new Field("cs-method",Arow[2],Field.Store.YES,Field.Index.ANALYZED));
document.add(new Field("cs-uri",Arow[3],Field.Store.YES,Field.Index.ANALYZED));
document.add(new Field("sc-status",Arow[4],Field.Store.YES,Field.Index.ANALYZED));
document.add(new Field("time-taken",Arow[5],Field.Store.YES,Field.Index.ANALYZED));
                 
// Adding document to the index file.
indexWriter.addDocument(document);
}        
indexWriter.optimize();
indexWriter.close();
reader.close();
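For reference, each line of Test.txt must carry six space-separated fields matching the document fields above: date, time, cs-method, cs-uri, sc-status, time-taken. The post never shows the data file itself, so the line below is a hypothetical example, chosen to be consistent with the query values used later:

2014-04-12 02:24:04 GET /blank 200 15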




//////////////////////////////////////////////////////////////////////////////////////////


// Searching the local (non-HDFS) index for a keyword
// Creating Searcher object and specifying the path where Indexed files are stored.
Searcher searcher = new IndexSearcher("E://DataFile/IndexFiles");
Analyzer analyzer = new StandardAnalyzer();


// Printing the total number of documents or entries present in the index file.
System.out.println("Total Documents = "+searcher.maxDoc()) ;
            
// Creating the QueryParser object and specifying the field name on 
//which search has to be done.
QueryParser parser = new QueryParser("cs-uri", analyzer);
            
// Creating the Query object and specifying the text for which search has to be done.
Query query = parser.parse("/blank");
            
// The line below performs the search on the index file.
Hits hits = searcher.search(query);
            
// Printing the number of documents or entries that match the search query.
System.out.println("Number of matching documents = "+ hits.length());


// Printing documents (or rows of file) that matched the search criteria.
for (int i = 0; i < hits.length(); i++)
{
    Document doc = hits.doc(i);
    System.out.println(doc.get("date")+" "+ doc.get("time")+ " "+
    doc.get("cs-method")+ " "+ doc.get("cs-uri")+ " "+ doc.get("sc-status")+ " "+ doc.get("time-taken"));
}
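A note on query syntax: characters such as ':' are QueryParser metacharacters, which is why the HDFS search at the end of this post escapes the time value by hand ("02\\:24\\:04"). These older Lucene releases also ship a static escape helper, so a sketch of the equivalent call (timeQuery is an illustrative name) would be:

// Escaping QueryParser metacharacters via the built-in helper instead of hand-written backslashes.
Query timeQuery = parser.parse(QueryParser.escape("02:24:04"));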




///////////////////////////////////////////////////////////




// Building an index from a file on HDFS as the data source, keeping the index in a RAMDirectory
// (Lucene cannot write an index straight to HDFS, since HDFS files cannot be randomly rewritten,
// so the index is built in memory first and copied out afterwards)
// Path where the index files will be stored. 
String Index_DIR="/IndexFiles/"; 


// Path where the data file is stored. 
String File_DIR="/DataFile/test.txt"; 


// Creating FileSystem object, to be able to work with HDFS 
Configuration config = new Configuration(); 
config.set("fs.default.name","hdfs://127.0.0.1:9000/"); 
FileSystem dfs = FileSystem.get(config); 


// Creating a RAMDirectory (memory) object, to be able to create index in memory. 
RAMDirectory rdir = new RAMDirectory();   


// Creating IndexWriter object for the Ram Directory 
IndexWriter indexWriter = new IndexWriter (rdir, new StandardAnalyzer(), true);    
         
// Creating FSDataInputStream object, for reading the data from "Test.txt" file residing on HDFS. 
FSDataInputStream filereader = dfs.open(new Path(dfs.getWorkingDirectory()+ File_DIR)); 
String row=null;           


// Reading each line present in the file. 
while ((row=filereader.readLine())!=null) {   


// Getting each field of the row into an array; fields are 
// space-separated. 
String Arow[]=row.split(" ");                   


// For each row, creating a document and adding data to the document  
// with the associated fields. 
org.apache.lucene.document.Document document = new org.apache.lucene.document.Document();      
document.add(new Field("date",Arow[0],Field.Store.YES,Field.Index.ANALYZED)); 
document.add(new Field("time",Arow[1],Field.Store.YES,Field.Index.ANALYZED)); 
document.add(new Field ("cs-method",Arow[2],Field.Store.YES,Field.Index.ANALYZED)); 
document.add(new Field ("cs-uri",Arow[3],Field.Store.YES,Field.Index.ANALYZED)); 
document.add(new Field ("sc-status",Arow[4],Field.Store.YES,Field.Index.ANALYZED)); 
document.add(new Field ("time-taken",Arow[5],Field.Store.YES,Field.Index.ANALYZED));  
                 
// Adding document to the index file. 
indexWriter.addDocument(document); 
}           
indexWriter.optimize(); 
indexWriter.close(); 
filereader.close(); 




////////////////////////////////////////////////////////////////////////


// Writing the in-memory index files back to HDFS
// Getting files present in memory into an array. 
String fileList[]=rdir.list();   


// Reading index files from memory and storing them to HDFS. 
for (int i = 0; i < fileList.length; i++) {     
IndexInput indxfile = rdir.openInput(fileList[i].trim());     
long len = indxfile.length();     
int len1 = (int) len;       


// Reading data from file into a byte array.     
byte[] bytarr = new byte[len1];     
indxfile.readBytes(bytarr, 0, len1);      
         
// Creating a file in the HDFS directory with the same name as the      
// index file.     
Path src = new Path(dfs.getWorkingDirectory()+Index_DIR+ fileList[i].trim());     
dfs.createNewFile(src);    
   
// Writing data from byte array to the file in HDFS 
FSDataOutputStream fs = dfs.create(new Path(dfs.getWorkingDirectory()+Index_DIR+fileList[i].trim()),true);     
fs.write(bytarr);     
fs.close(); 
indxfile.close(); 
} 

dfs.closeAll(); 
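As an aside, this manual byte-copy loop could probably be replaced by Lucene's static Directory.copy() combined with the hadoop contrib FileSystemDirectory used in the next section. A minimal sketch, assuming FileSystemDirectory accepts writes and that the Lucene release in use still has the static helper (it was removed in Lucene 4.0):

// Hypothetical alternative: let Lucene copy the index files to HDFS itself.
Path indexPath = new Path(dfs.getWorkingDirectory() + Index_DIR);
FileSystemDirectory hdfsDir = new FileSystemDirectory(dfs, indexPath, true, config);
Directory.copy(rdir, hdfsDir, false); // false = leave the source RAMDirectory open
hdfsDir.close();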




/////////////////////////////////////////////////////////////////////////


// Reading the index files on HDFS back into a RAMDirectory
// Creating FileSystem object, to be able to work with HDFS 
Configuration config = new Configuration(); 
config.set("fs.default.name","hdfs://127.0.0.1:9000/"); 
FileSystem dfs = FileSystem.get(config);   


// Creating a RAMDirectory (memory) object, to be able to create index in memory. 
RAMDirectory rdir = new RAMDirectory();      
         
// Getting the list of index files present in the directory into an array. 
Path pth = new Path(dfs.getWorkingDirectory()+Index_DIR); 
FileSystemDirectory fsdir = new FileSystemDirectory(dfs,pth,false,config); 
String filelst[] = fsdir.list(); 
FSDataInputStream filereader = null; 
for (int i = 0; i < filelst.length; i++) { 
// Reading data from index files on HDFS directory into filereader object. 
filereader = dfs.open(new Path(dfs.getWorkingDirectory()+Index_DIR+filelst[i]));                  
int size = filereader.available();     
  
// Reading data from file into a byte array.                 
byte[] bytarr = new byte[size];     
filereader.read(bytarr, 0, size);      
 
// Creating a file in the RAM directory with the same name as the  
// index file on HDFS.     
IndexOutput indxout = rdir.createOutput(filelst[i]);       


// Writing data from byte array to the file in RAM directory     
indxout.writeBytes(bytarr,bytarr.length);     
indxout.flush();             
indxout.close();                 

filereader.close(); 
} 
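The same shortcut applies in reverse: assuming the contrib FileSystemDirectory behaves as an ordinary read-only Directory, the read-back loop above could likely be collapsed to a single call:

// Hypothetical: copy every index file from HDFS into the RAMDirectory in one step.
Directory.copy(fsdir, rdir, false);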






/////////////////////////////////////////////////////////////////////////////




// Searching on the index built above
Searcher searcher = new IndexSearcher(rdir); 
Analyzer analyzer = new StandardAnalyzer();   
System.out.println("Total Documents = "+searcher.maxDoc()) ;               
QueryParser parser = new QueryParser("time", analyzer);               
Query query = parser.parse("02\\:24\\:04");               
Hits hits = searcher.search(query);               
System.out.println("Number of matching documents = "+ hits.length());   
for (int i = 0; i < hits.length(); i++) { 
Document doc = hits.doc(i); 
System.out.println(doc.get("date")+" "+ doc.get("time")+ " "+ doc.get("cs-method")+ " "+ doc.get("cs-uri")+ " "+ doc.get("sc-status")+ " "+ doc.get("time-taken")); 
} 
searcher.close(); 


Source: “ ITPUB博客 ”, link: http://blog.itpub.net/29603416/viewspace-1140727/ — please credit the source when reposting.
