topic_mining.bib

@article{Xu2015166,
  title = {Implicit feature identification in Chinese reviews using explicit topic mining model},
  journal = {Knowledge-Based Systems},
  volume = {76},
  pages = {166--175},
  year = {2015},
  issn = {0950-7051},
  doi = {10.1016/j.knosys.2014.12.012},
  url = {http://www.sciencedirect.com/science/article/pii/S095070511400450X},
  author = {Hua Xu and Fan Zhang and Wei Wang},
  keywords = {Opinion mining, Implicit feature, Topic model, Support vector machine, Product review},
  abstract = {The essential work of feature-specific opinion mining is centered on the product features. Previous related research work has often taken explicit features into account but ignored implicit features. However, implicit feature identification, which can help us better understand the reviews, is an essential aspect of feature-specific opinion mining. This paper is mainly centered on implicit feature identification in Chinese product reviews. We think that, based on the explicit synonymous feature group and the sentences which contain explicit features, several Support Vector Machine (SVM) classifiers can be established to classify the non-explicit sentences. Nevertheless, instead of simply using traditional feature selection methods, we believe an explicit topic model in which each topic is pre-defined could perform better. In this paper, we first extend a popular topic modeling method, called Latent Dirichlet Allocation (LDA), to construct an explicit topic model. Then some types of prior knowledge, such as must-links, cannot-links and relevance-based prior knowledge, are extracted and incorporated into the explicit topic model automatically. Experiments show that the explicit topic model, which incorporates pre-existing knowledge, outperforms traditional feature selection methods and other existing methods by a large margin, and the identification task can be completed better.}
}
@article{Sun20151,
  title = {MSR4SM: Using topic models to effectively mining software repositories for software maintenance tasks},
  journal = {Information and Software Technology},
  volume = {66},
  pages = {1--12},
  year = {2015},
  issn = {0950-5849},
  doi = {10.1016/j.infsof.2015.05.003},
  url = {http://www.sciencedirect.com/science/article/pii/S0950584915001007},
  author = {Xiaobing Sun and Bixin Li and Hareton Leung and Bin Li and Yun Li},
  keywords = {Software maintenance, Mining software historical repositories, Topic model, Empirical study},
  abstract = {Context: Mining software repositories has emerged as a research direction over the past decade, achieving substantial success in both research and practice to support various software maintenance tasks. Software repositories include the bug repository, communication archives, source control repository, etc. When using these repositories to support software maintenance, inclusion of irrelevant information in each repository can lead to decreased effectiveness or even wrong results. Objective: This article aims at selecting the relevant information from each of the repositories to improve the effectiveness of software maintenance tasks. Method: For a maintenance task at hand, maintainers need to implement the maintenance request on the current system. In this article, we propose an approach, MSR4SM, to extract the relevant information from each software repository based on the maintenance request and the current system. That is, if the information in a software repository is relevant to either the maintenance request or the current system, this information should be included to perform the current maintenance task. MSR4SM uses the topic model to extract the topics from these software repositories. Then, relevant information in each software repository is extracted based on the topics. Results: MSR4SM is evaluated for two software maintenance tasks, feature location and change impact analysis, which are based on four subject systems, namely jEdit, ArgoUML, Rhino and KOffice. The empirical results show that the effectiveness of traditional software-repository-based maintenance tasks can be greatly improved by MSR4SM. Conclusions: There is a lot of irrelevant information in software repositories. Before we use them to implement a maintenance task at hand, we need to preprocess them. Then, the effectiveness of the software maintenance tasks can be improved.}
}
@article{Guo20122008,
  title = {Mining Hot Topics from Twitter Streams},
  journal = {Procedia Computer Science},
  volume = {9},
  pages = {2008--2011},
  year = {2012},
  note = {Proceedings of the International Conference on Computational Science, {ICCS} 2012},
  issn = {1877-0509},
  doi = {10.1016/j.procs.2012.04.224},
  url = {http://www.sciencedirect.com/science/article/pii/S1877050912003456},
  author = {Jing Guo and Peng Zhang and Jianlong Tan and Li Guo},
  keywords = {Data stream mining, Hot topic mining, Frequent pattern mining, Twitter streams},
  abstract = {Mining hot topics from twitter streams has attracted a lot of attention in recent years. Traditional hot topic mining from Internet Web pages was mainly based on text clustering. However, compared to the texts in Web pages, twitter texts are relatively short with sparse attributes. Moreover, twitter data often increase rapidly with fast spreading speed, which poses a great challenge to existing topic mining models. To this end, we propose, in this paper, a flexible stream mining approach for hot twitter topic detection. Specifically, we propose to use the Frequent Pattern stream mining algorithm (i.e. FP-stream) to detect hot topics from twitter streams. Empirical studies on real world twitter data demonstrate the utility of the proposed method.}
}
@article{Rao201490,
  title = {Sentiment topic models for social emotion mining},
  journal = {Information Sciences},
  volume = {266},
  pages = {90--100},
  year = {2014},
  issn = {0020-0255},
  doi = {10.1016/j.ins.2013.12.059},
  url = {http://www.sciencedirect.com/science/article/pii/S002002551400019X},
  author = {Yanghui Rao and Qing Li and Xudong Mao and Liu Wenyin},
  keywords = {Social emotion mining, Sentiment topic model, Social emotion classification, Social emotion lexicon},
  abstract = {The rapid development of social media services has facilitated the communication of opinions through online news, blogs, microblogs/tweets, instant-messages, and so forth. This article concentrates on the mining of readers’ emotions evoked by social media materials. Compared to the classical sentiment analysis from writers’ perspective, sentiment analysis of readers is sometimes more meaningful in social media. We propose two sentiment topic models to associate latent topics with evoked emotions of readers. The first model, which is an extension of the existing Supervised Topic Model, first generates a set of topics from words and then samples emotions from each topic. The second model generates topics from social emotions directly. Both models can be applied to social emotion classification and generate social emotion lexicons. Evaluation on social emotion classification verifies the effectiveness of the proposed models. The generated social emotion lexicon samples further show that our models can discover meaningful latent topics exhibiting emotion focus.}
}
@incollection{Marwick201463,
  title = {Chapter 3 - Discovery of Emergent Issues and Controversies in Anthropology Using Text Mining, Topic Modeling, and Social Network Analysis of Microblog Content},
  editor = {Yanchang Zhao and Yonghua Cen},
  booktitle = {Data Mining Applications with R},
  publisher = {Academic Press},
  address = {Boston},
  year = {2014},
  pages = {63--93},
  isbn = {978-0-12-411511-8},
  doi = {10.1016/B978-0-12-411511-8.00003-7},
  url = {http://www.sciencedirect.com/science/article/pii/B9780124115118000037},
  author = {Ben Marwick},
  keywords = {Twitter, Text mining, Topic modeling, Sentiment analysis, Social network analysis, Anthropology},
  abstract = {R is a convenient tool for analyzing text content to discover emergent issues and controversies in diverse corpora. In this case study, I investigate the use of Twitter at a major conference of professional and academic anthropologists. Using R, I identify the demographics of the community, the structure of the community of Twitter-using anthropologists, and the topics that dominate the Twitter messages. I describe a series of statistical methods for handling a large corpus of Twitter messages that might otherwise be impractical to analyze. A key finding is that the transformative effect of Twitter in academia is to easily enable the spontaneous formation of information-sharing communities bound by an interest in an event or topic.}
}
@article{Wang201393,
  title = {Unsupervised mining of long time series based on latent topic model},
  journal = {Neurocomputing},
  volume = {103},
  pages = {93--103},
  year = {2013},
  issn = {0925-2312},
  doi = {10.1016/j.neucom.2012.09.008},
  url = {http://www.sciencedirect.com/science/article/pii/S0925231212007527},
  author = {Jin Wang and Xiangping Sun and Mary F.H. She and Abbas Kouzani and Saeid Nahavandi},
  keywords = {ECG signals, Bag-of-patterns, Unsupervised learning, pLSA, LDA},
  abstract = {This paper presents a novel unsupervised method for mining time series based on two generative topic models, i.e., probabilistic Latent Semantic Analysis (pLSA) and Latent Dirichlet Allocation (LDA). The proposed method treats each time series as a text document, and extracts a set of local patterns from the sequence as words by sliding a short temporal window along the sequence. Motivated by the success of latent topic models in text document analysis, latent topic models are extended to find the underlying structure of time series in an unsupervised manner. The clusters or categories of unlabeled time series are automatically discovered by the latent topic models using bag-of-patterns representation. The proposed method was experimentally validated using two sets of time series data extracted from a public Electrocardiography (ECG) database through comparison with the baseline k-means and the Normalized Cuts approaches. In addition, the impact of the bag-of-patterns' parameters was investigated. Experimental results demonstrate that the proposed unsupervised method not only outperforms the baseline k-means and the Normalized Cuts in learning semantic categories of the unlabeled time series, but also is relatively stable with respect to the bag-of-patterns' parameters. To the best of our knowledge, this work is the first attempt to explore latent topic models for unsupervised mining of time series data. }
}
@article{Tsai20115330,
  title = {A tag-topic model for blog mining},
  journal = {Expert Systems with Applications},
  volume = {38},
  number = {5},
  pages = {5330--5335},
  year = {2011},
  issn = {0957-4174},
  doi = {10.1016/j.eswa.2010.10.025},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417410011693},
  author = {Flora S. Tsai},
  keywords = {Blog mining, Weblog, Tags, Author-Topic model, Isomap, Latent Dirichlet Allocation},
  abstract = {Blog mining addresses the problem of mining information from blog data. Although mining blogs may share many similarities to Web and text documents, existing techniques need to be reevaluated and adapted for the multidimensional representation of blog data, which exhibit dimensions not present in traditional documents, such as tags. Blog tags are semantic annotations in blogs which can be valuable sources of additional labels for the myriad of blog documents. In this paper, we present a tag-topic model for blog mining, which is based on the Author-Topic model and Latent Dirichlet Allocation. The tag-topic model determines the most likely tags and words for a given topic in a collection of blog posts. The model has been successfully implemented and evaluated on real-world blog data. }
}
@article{Özyurt20108705,
  title = {Chat mining: Automatically determination of chat conversations’ topic in Turkish text based chat mediums},
  journal = {Expert Systems with Applications},
  volume = {37},
  number = {12},
  pages = {8705--8710},
  year = {2010},
  issn = {0957-4174},
  doi = {10.1016/j.eswa.2010.06.053},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417410005579},
  author = {Özcan Özyurt and Cemal Köse},
  keywords = {Chat mining, Topic detection, Chat conversations, Feature selection, Text classification},
  abstract = {Mostly, the conversations taking place in chat mediums bear important information concerning the speakers. This information can vary across many fields such as tendencies, habits, attitudes, guilt situations, and intentions of the speakers. Therefore, analysis and processing of these conversations are of much importance, and many social and semantic inferences can be made from them. Determining the topic of a conversation can serve as a basis for characterizing and analyzing chat conversations. In this study, chat mining is chosen as an application of text mining, and a study concerning topic determination in Turkish text based chat conversations is conducted. Supervised learning methods are used to classify the conversations; as classifiers, Naive Bayes, k-Nearest Neighbor and Support Vector Machine are used. Ninety-one percent success is achieved in the determination of topic.}
}
@article{Yang2011633,
  title = {Managing and mining multilingual documents: Introduction to the special topic issue of Information Processing \& Management},
  journal = {Information Processing \& Management},
  volume = {47},
  number = {5},
  pages = {633--634},
  year = {2011},
  note = {Managing and Mining Multilingual Documents},
  issn = {0306-4573},
  doi = {10.1016/j.ipm.2010.02.002},
  url = {http://www.sciencedirect.com/science/article/pii/S0306457310000166},
  author = {Christopher C. Yang and Chih-Ping Wei and Lee-Feng Chien}
}
@article{Zeng20103202,
  title = {Multi-grain hierarchical topic extraction algorithm for text mining},
  journal = {Expert Systems with Applications},
  volume = {37},
  number = {4},
  pages = {3202--3208},
  year = {2010},
  issn = {0957-4174},
  doi = {10.1016/j.eswa.2009.09.061},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417409008422},
  author = {Jianping Zeng and Chengrong Wu and Wei Wang},
  keywords = {Hierarchical topic, Topic grain, Feature selection, Text mining},
  abstract = {Topic extraction from a text corpus is the foundation of many topic analysis tasks, such as topic trend prediction and opinion extraction. Since hierarchical structure is characteristic of topics, it is preferable for a topic extraction algorithm to output the topic descriptions with this kind of structure. However, the hierarchical topic structure that is extracted by most current topic analysis algorithms cannot provide a meaningful description for all subtopics in the hierarchical tree. Here, we propose a new hierarchical topic extraction algorithm based on topic grain computation. By considering the distribution of word document frequency as a mixture Gaussian, an EM-like algorithm is employed to find the best number of mixture components and the mean value of each component. Then the topic grain is defined based on the mixture Gaussian parameters, and feature words are selected for the grain. A clustering algorithm is applied to the converted text set based on the feature words. After repeatedly applying the clustering algorithm to different converted text sets, a multi-grain hierarchical topic structure with different subtopic feature word descriptions is extracted. Experiments on two real world datasets collected from a news website show that the proposed algorithm can generate a more meaningful multi-grain topic structure compared with current hierarchical topic clustering algorithms.}
}
@article{PonsPorrata2007752,
  title = {Topic discovery based on text mining techniques},
  journal = {Information Processing \& Management},
  volume = {43},
  number = {3},
  pages = {752--768},
  year = {2007},
  note = {Special Issue on Heterogeneous and Distributed {IR}},
  issn = {0306-4573},
  doi = {10.1016/j.ipm.2006.06.001},
  url = {http://www.sciencedirect.com/science/article/pii/S0306457306000914},
  author = {Aurora Pons-Porrata and Rafael Berlanga-Llavori and José Ruiz-Shulcloper},
  keywords = {Hierarchical clustering, Text summarization, Topic detection},
  abstract = {In this paper, we present a topic discovery system aimed at revealing the implicit knowledge present in news streams. This knowledge is expressed as a hierarchy of topics/subtopics, where each topic contains the set of documents that are related to it and a summary extracted from these documents. Summaries so built are useful to browse and select topics of interest from the generated hierarchies. Our proposal consists of a new incremental hierarchical clustering algorithm, which combines both partitional and agglomerative approaches, taking the main benefits from them. Finally, a new summarization method based on Testor Theory has been proposed to build the topic summaries. Experimental results on the TDT2 collection demonstrate its usefulness and effectiveness not only as a topic detection system, but also as a classification and summarization tool.}
}
@article{Zeng20126541,
  title = {Topics modeling based on selective Zipf distribution},
  journal = {Expert Systems with Applications},
  volume = {39},
  number = {7},
  pages = {6541--6546},
  year = {2012},
  issn = {0957-4174},
  doi = {10.1016/j.eswa.2011.12.051},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417411017222},
  author = {Jianping Zeng and Jiangjiao Duan and Wenjun Cao and Chengrong Wu},
  keywords = {Selective Zipf distribution, Well-form topics, Dirichlet prior, Topic model, Topic mining},
  abstract = {Automatically mining topics out of a text corpus has become an important foundation of many topic analysis tasks, such as opinion recognition, Web content classification, etc. Although a large number of topic models and topic mining methods have been proposed for different purposes and have shown success in dealing with topic analysis tasks, more accurate models and mining algorithms are desired for many applications. A general criterion based on Zipf fitness quantity computation is proposed to determine whether a topic description is well-formed or not. Based on this quantity definition, the popular Dirichlet prior on multinomial parameters is found to be unable to always produce well-formed topic descriptions. Hence, topic modeling based on LDA with selective Zipf documents as the training dataset is proposed to improve the quality of the generated topic descriptions. Experiments on two standard text corpuses, i.e. the AP dataset and Reuters-21578, show that the modeling method based on the selective Zipf distribution can achieve better perplexity, which means better ability to predict topics. A test of topic extraction on a collection of news documents about the recent financial crisis shows that the description key words in topics are more meaningful and reasonable than those of traditional topic mining methods.}
}
@article{Li2012237,
  title = {Adding community and dynamic to topic models},
  journal = {Journal of Informetrics},
  volume = {6},
  number = {2},
  pages = {237--253},
  year = {2012},
  issn = {1751-1577},
  doi = {10.1016/j.joi.2011.11.004},
  url = {http://www.sciencedirect.com/science/article/pii/S1751157711001039},
  author = {Daifeng Li and Ying Ding and Xin Shuai and Johan Bollen and Jie Tang and Shanshan Chen and Jiayi Zhu and Guilherme Rocha},
  keywords = {Social network, Semantic community, Topic mining, Dynamic},
  abstract = {The detection of communities in large social networks is receiving increasing attention in a variety of research areas. Most existing community detection approaches focus on the topology of social connections (e.g., coauthor, citation, and social conversation) without considering their topic and dynamic features. In this paper, we propose two models to detect communities by considering both topic and dynamic features. First, the Community Topic Model (CTM) can identify communities sharing similar topics. Second, the Dynamic CTM (DCTM) can capture the dynamic features of communities and topics based on the Bernoulli distribution that leverages the temporal continuity between consecutive timestamps. Both models were tested on two datasets: ArnetMiner and Twitter. Experiments show that communities with similar topics can be detected and the co-evolution of communities and topics can be observed by these two models, which allow us to better understand the dynamic features of social networks and make improved personalized recommendations.}
}
@article{Yang2015251,
  title = {Incorporating self-organizing map with text mining techniques for text hierarchy generation},
  journal = {Applied Soft Computing},
  volume = {34},
  pages = {251--259},
  year = {2015},
  issn = {1568-4946},
  doi = {10.1016/j.asoc.2015.05.005},
  url = {http://www.sciencedirect.com/science/article/pii/S1568494615003051},
  author = {Hsin-Chang Yang and Chung-Hong Lee and Han-Wei Hsiao},
  keywords = {Text mining, Self-organizing map, Topic identification, Hierarchy generation},
  abstract = {Self-organizing maps (SOM) have been applied to numerous data clustering and visualization tasks and received much attention for their success. One major shortcoming of the classical SOM learning algorithm is the necessity of a predefined map topology. Furthermore, hierarchical relationships among data are also difficult to find. Several approaches have been devised to overcome these deficiencies. In this work, we propose a novel SOM learning algorithm which incorporates several text mining techniques in expanding the map both laterally and hierarchically. When training on a set of text documents, the proposed algorithm will first cluster them using the classical SOM algorithm. We then identify the topics of each cluster. These topics are then used to evaluate the criteria for expanding the map. The major characteristic of the proposed approach is to combine the learning process with the text mining process, making it suitable for automatic organization of text documents. We applied the algorithm to the Reuters-21578 dataset in text clustering and categorization tasks. Our method outperforms two comparison models in hierarchy quality according to users’ evaluation. It also receives better F1-scores than two other models in the text categorization task.}
}
@article{Hadzic201597,
  title = {Ordered subtree mining via transactional mapping using a structure-preserving tree database schema},
  journal = {Information Sciences},
  volume = {310},
  pages = {97--117},
  year = {2015},
  issn = {0020-0255},
  doi = {10.1016/j.ins.2015.03.015},
  url = {http://www.sciencedirect.com/science/article/pii/S0020025515001802},
  author = {Fedja Hadzic and Michael Hecker and Andrea Tagarelli},
  keywords = {Frequent subtree mining, Position-constrained subtree discovery, Transactional representation, Semistructured data},
  abstract = {Frequent subtree mining is a major research topic in knowledge discovery from tree-structured data, whose importance is witnessed by the pervasiveness of such data in several domains. In this paper, we present a novel approach to discover all the frequent ordered subtrees in a tree-structured database. A key aspect is that the structural aspects of the input tree instances are extracted to generate a transactional format that enables the application of standard itemset mining techniques. In this way, the expensive process of subtree enumeration is avoided, while subtrees can be reconstructed in a post-processing stage. As a result, more structurally complex tree data can be handled and much lower support thresholds can be used. In addition to discovering traditional subtrees, this is the first approach to frequent subtree mining that can discover position-constrained subtrees. Each node in the position-constrained subtree is annotated with its exact occurrence and level of embedding in the original database tree. Also, disconnected subtree associations can be represented via virtual connecting nodes. Experiments conducted on synthetic and real-world datasets confirm the expected advantages of our approach over competing methods in terms of efficiency, mining capabilities, and informativeness of the extracted patterns.}
}
@article{Zhang201567,
  title = {Mining summarization of high utility itemsets},
  journal = {Knowledge-Based Systems},
  volume = {84},
  pages = {67--77},
  year = {2015},
  issn = {0950-7051},
  doi = {10.1016/j.knosys.2015.04.004},
  url = {http://www.sciencedirect.com/science/article/pii/S0950705115001422},
  author = {Xiong Zhang and Zhi-Hong Deng},
  keywords = {Data mining, High utility itemsets, Utility mining, Summarization},
  abstract = {Mining interesting itemsets from transaction databases has attracted a lot of research interest for decades. In recent years, the high utility itemset (HUI) has emerged as a hot topic in this field. In real applications, the bottleneck of HUI mining is not efficiency but interpretability, due to the huge number of itemsets generated by the mining process. Because the downward closure property of itemsets no longer holds for HUIs, the compression and summarization methods for frequent itemsets are not applicable. With this in mind, considering coverage and diversity, we introduce a novel well-founded approach, called SUIT-miner, for succinctly summarizing HUIs with a small collection of itemsets. First, we define the condition under which an itemset can cover another itemset. Then, a greedy algorithm is presented to find the fewest itemsets that cover all HUIs, in order to ensure diversity. To enhance efficiency, the greedy algorithm employs some pruning strategies. To evaluate the performance of SUIT-miner, we conduct extensive experiments on real datasets. The experimental results show that SUIT-miner is effective and efficient.}
}
@article{Petz2015510,
  title = {Reprint of: Computational approaches for mining user’s opinions on the Web 2.0},
  journal = {Information Processing \& Management},
  volume = {51},
  number = {4},
  pages = {510--519},
  year = {2015},
  issn = {0306-4573},
  doi = {10.1016/j.ipm.2014.07.011},
  url = {http://www.sciencedirect.com/science/article/pii/S0306457315000655},
  author = {Gerald Petz and Michał Karpowicz and Harald Fürschuß and Andreas Auinger and Václav Stříteský and Andreas Holzinger},
  keywords = {Opinion mining, Noisy text, Text preprocessing, User generated content, Data mining},
  abstract = {The emerging research area of opinion mining deals with computational methods to find, extract and systematically analyze people’s opinions, attitudes and emotions towards certain topics. While providing interesting market research information, the user generated content existing on the Web 2.0 presents numerous challenges regarding systematic analysis, the differences and unique characteristics of the various social media channels being one of them. This article reports on the determination of such particularities, and deduces their impact on text preprocessing and opinion mining algorithms. The effectiveness of different algorithms is evaluated in order to determine their applicability to the various social media channels. Our research shows that text preprocessing algorithms are mandatory for mining opinions on the Web 2.0 and that some of these algorithms are sensitive to errors and mistakes contained in the user generated content.}
}
@article{Balazs201695,
  title = {Opinion Mining and Information Fusion: A survey},
  journal = {Information Fusion},
  volume = {27},
  pages = {95--110},
  year = {2016},
  issn = {1566-2535},
  doi = {10.1016/j.inffus.2015.06.002},
  url = {http://www.sciencedirect.com/science/article/pii/S1566253515000536},
  author = {Jorge A. Balazs and Juan D. Velásquez},
  keywords = {Information Fusion, Survey, Opinion Mining, Sentiment Analysis},
  abstract = {Interest in Opinion Mining has been growing steadily in recent years, mainly because of its great number of applications and the scientific challenge it poses. Accordingly, the resources and techniques to help tackle the problem are many, and most of the latest work fuses them at some stage of the process. However, this combination is usually executed without following any defined guidelines, overlooking the possibility of replicating and improving it; hence the need for a deeper understanding of the fusion process becomes apparent. Information Fusion is the field charged with researching efficient methods for transforming information from different sources into a single coherent representation, and therefore can be used to guide fusion processes in Opinion Mining. In this paper we present a survey on Information Fusion applied to Opinion Mining. We first define Opinion Mining and describe its most fundamental aspects, then explain Information Fusion, and finally review several Opinion Mining studies that rely at some point on the fusion of information.}
}
@article{Le20156648,
  title = {An N-list-based algorithm for mining frequent closed patterns},
  journal = {Expert Systems with Applications},
  volume = {42},
  number = {19},
  pages = {6648--6657},
  year = {2015},
  issn = {0957-4174},
  doi = {10.1016/j.eswa.2015.04.048},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417415002900},
  author = {Tuong Le and Bay Vo},
  keywords = {Data mining, Frequent closed pattern, N-list structure},
  abstract = {Frequent closed patterns (FCPs), a condensed representation of frequent patterns, have been proposed for the mining of (minimal) non-redundant association rules to improve performance in terms of memory usage and mining time. Recently, the N-list structure has been proven to be very efficient for mining frequent patterns. This study proposes an N-list-based algorithm for mining FCPs called NAFCP. Two theorems for quickly determining FCPs based on the N-list structure are proposed. The N-list structure provides a much more compact representation compared to previously proposed vertical structures, reducing the memory usage and mining time required for mining FCPs. The experimental results show that NAFCP outperforms previous algorithms in terms of runtime and memory usage in most cases.}
}
@article{Lin201549,
  title = {A fast and resource efficient mining algorithm for discovering frequent patterns in distributed computing environments},
  journal = {Future Generation Computer Systems},
  volume = {52},
  pages = {49--58},
  year = {2015},
  issn = {0167-739X},
  doi = {10.1016/j.future.2015.05.009},
  url = {http://www.sciencedirect.com/science/article/pii/S0167739X15001843},
  author = {Kawuu W. Lin and Sheng-Hao Chung},
  keywords = {Data mining, Frequent pattern mining, Distributed mining, Parallel mining},
  abstract = {The advancement of electronic technology enables us to collect logs from various devices. Such logs require detailed analysis in order to be broadly useful. Data mining is a technique that has been widely used to extract hidden information from such data. Data mining is mainly composed of association rule mining, sequential pattern mining, classification and clustering. Association rule mining has attracted significant attention and been successfully applied to various fields. Although past studies can effectively discover frequent patterns to deduce association rules, execution efficiency is still a critical problem. To speed up execution, many methods using parallel and distributed computing technology have been proposed in recent years. Most past studies focused on parallelizing the workload in a high end machine or in distributed computing environments like grid or cloud computing systems; however, very few of them discuss how to efficiently determine the appropriate number of computing nodes, considering execution efficiency and load balancing. An intuition is that execution speed is proportional to the number of computing nodes, that is, the more computing nodes, the faster the execution. However, this is incorrect for such algorithms because of their inherent algorithmic design. Allocating too many computing nodes can lead to high execution time. In addition to the execution inefficiency, inappropriate resource allocation is a waste of computing power and network bandwidth. At the same time, load cannot be effectively distributed if too few nodes are allocated. In this paper, we propose a fast, load balancing and resource efficient algorithm named FLR-Mining for discovering frequent patterns in distributed computing systems. FLR-Mining is capable of determining the appropriate number of computing nodes automatically and achieving better load balancing compared with existing methods. Through empirical evaluation, FLR-Mining is shown to deliver excellent performance in terms of execution efficiency and load balancing.}
}
@article{Senderovich2015278,
  title = {Queue mining for delay prediction in multi-class service processes},
  journal = {Information Systems},
  volume = {53},
  pages = {278--295},
  year = {2015},
  issn = {0306-4379},
  doi = {10.1016/j.is.2015.03.010},
  url = {http://www.sciencedirect.com/science/article/pii/S0306437915000708},
  author = {Arik Senderovich and Matthias Weidlich and Avigdor Gal and Avishai Mandelbaum},
  keywords = {Delay prediction, Process mining, Queueing theory, Queue mining},
  abstract = {Information systems have been widely adopted to support service processes in various domains, e.g., in the telecommunication, finance, and health sectors. Information recorded by systems during the operation of these processes provides an angle for operational process analysis, commonly referred to as process mining. In this work, we establish a queueing perspective in process mining to address the online delay prediction problem, which refers to the time that the execution of an activity for a running instance of a service process is delayed due to queueing effects. We present predictors that treat queues as first-class citizens and either enhance existing regression-based techniques for process mining or are directly grounded in queueing theory. In particular, our predictors target multi-class service processes, in which requests are classified by a type that influences their processing. Further, we introduce queue mining techniques that derive the predictors from event logs recorded by an information system during process execution. Our evaluation, based on large real-world datasets from the telecommunications and financial sectors, shows that our techniques yield accurate online predictions of case delay and drastically improve over predictors neglecting the queueing perspective.}
}
@article{Wen20156423,
  title = {Activity recognition with weighted frequent patterns mining in smart environments},
  journal = {Expert Systems with Applications},
  volume = {42},
  number = {17--18},
  pages = {6423--6432},
  year = {2015},
  issn = {0957-4174},
  doi = {10.1016/j.eswa.2015.04.020},
  url = {http://www.sciencedirect.com/science/article/pii/S095741741500250X},
  author = {Jiahui Wen and Mingyang Zhong and Zhiying Wang},
  keywords = {Data mining, Association rule, Activity recognition, Global and local weight, Smart environments},
  abstract = {In the past decades, activity recognition has aroused great interest among research groups focusing on context-awareness computing and human behaviour monitoring. However, the correlations between the activities and their frequent patterns have never been directly addressed by traditional activity recognition techniques. As a result, activities that trigger the same set of sensors are difficult to differentiate, even though they present different patterns such as different frequencies of the sensor events. In this paper, we propose an efficient association rule mining technique to find the association rules between the activities and their frequent patterns, and build an activity classifier based on these association rules. We also address the classification of overlapped activities by incorporating the global and local weights of the patterns. The experiment results using a publicly available dataset demonstrate that our method is able to achieve better performance than traditional recognition methods such as Decision Tree, Naive Bayesian and HMM. Comparison studies show that the proposed association rule mining method is efficient, and we can further improve the activity recognition accuracy by considering global and local weights of frequent patterns of activities.}
}
@article{Deng20155424,
  title = {PrePost+: An efficient N-lists-based algorithm for mining frequent itemsets via Children–Parent Equivalence pruning},
  journal = {Expert Systems with Applications},
  volume = {42},
  number = {13},
  pages = {5424--5432},
  year = {2015},
  issn = {0957-4174},
  doi = {10.1016/j.eswa.2015.03.004},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417415001803},
  author = {Zhi-Hong Deng and Sheng-Long Lv},
  keywords = {Data mining, Frequent itemset mining, N-lists, Pruning, Algorithm},
  abstract = {The N-list is a novel data structure proposed in recent years. It has been proven to be very efficient for mining frequent itemsets. In this paper, we present PrePost+, a high-performance algorithm for mining frequent itemsets. It employs N-lists to represent itemsets and directly discovers frequent itemsets using a set-enumeration search tree. In particular, it employs an efficient pruning strategy named Children–Parent Equivalence pruning to greatly reduce the search space. We have conducted extensive experiments to evaluate PrePost+ against three state-of-the-art algorithms, namely PrePost, FIN, and FP-growth*, on six various real datasets. The experimental results show that PrePost+ is always the fastest on all datasets. Moreover, PrePost+ also demonstrates good performance in terms of memory consumption, since it uses only a little more memory than FP-growth* and less memory than PrePost and FIN.}
}
@article{Költringer20151836,
  title = {Analyzing destination branding and image from online sources: A web content mining approach},
  journal = {Journal of Business Research},
  volume = {68},
  number = {9},
  pages = {1836--1843},
  year = {2015},
  issn = {0148-2963},
  doi = {10.1016/j.jbusres.2015.01.011},
  url = {http://www.sciencedirect.com/science/article/pii/S0148296315000259},
  author = {Clemens Költringer and Astrid Dickinger},
  keywords = {Place brand, Destination image, Media monitoring, Text mining, Content analysis, Correspondence analysis},
  abstract = {Destination image, place brand, and branding continue to receive attention from researchers and industry. However, a thorough definition and differentiation of these terms and further investigation are still necessary. Digital information sources provide relevant image formation and branding agents and thus potentially impact travelers' image and serve as platforms to communicate perceptions. With abundant online information on places available, the data offer insights into brand identity communications and the image perceptions of travelers. This study presents an automated web content mining approach. A total set of 5719 documents informs the online destination representation in various online sources. Results demonstrate how to extract destination brand identity and image through web content mining.}
}
@article{Fu2015102,
  title = {Dynamic non-parametric joint sentiment topic mixture model},
  journal = {Knowledge-Based Systems},
  volume = {82},
  pages = {102--114},
  year = {2015},
  issn = {0950-7051},
  doi = {10.1016/j.knosys.2015.02.021},
  url = {http://www.sciencedirect.com/science/article/pii/S0950705115000751},
  author = {Xianghua Fu and Kun Yang and Joshua Zhexue Huang and Laizhong Cui},
  keywords = {Topic sentiment analysis, Dynamic topic analysis, Non-parametric topic model, Social media, Hierarchical Dirichlet Process, Text mining},
  abstract = {Reviews in social media are produced continuously by a large and uncontrolled number of users. Capturing the mixture of sentiment and topics simultaneously in reviews is still a challenging task. In this paper, we present a novel probabilistic model framework based on the non-parametric hierarchical Dirichlet process (HDP) topic model, called the non-parametric joint sentiment topic mixture model (NJST), which adds a sentiment level to the HDP topic model and detects sentiment and topics simultaneously from reviews. Then, considering the dynamic nature of social media data, we propose dynamic NJST (dNJST), which adds time decay dependencies of historical epochs to the current epoch. Compared with existing sentiment topic mixture models, which are based on latent Dirichlet allocation (LDA), the biggest difference of NJST and dNJST is that they can determine the number of topics automatically. We implement NJST and dNJST with online variational inference algorithms, and incorporate the sentiment priors of words into NJST and dNJST with the HowNet lexicon. The experiment results on a Chinese social media dataset show that dNJST can effectively detect and track dynamic sentiment and topics.}
}
@article{Lei20152567,
  title = {Saliency-driven image classification method based on histogram mining and image score},
  journal = {Pattern Recognition},
  volume = {48},
  number = {8},
  pages = {2567--2580},
  year = {2015},
  issn = {0031-3203},
  doi = {10.1016/j.patcog.2015.02.004},
  url = {http://www.sciencedirect.com/science/article/pii/S0031320315000527},
  author = {Baiying Lei and Ee-Leng Tan and Siping Chen and Dong Ni and Tianfu Wang},
  keywords = {Image classification, Bag of phrase, Saliency map, Histogram mining, Image score},
  abstract = {Since most image classification tasks involve discriminative information (i.e., saliency), this paper proposes a new bag-of-phrase (BoP) approach to incorporate this information. Specifically, the saliency map and local features are first extracted from edge-based dense descriptors. These features are represented by histograms and mined with a discriminative learning technique. An image score calculated from the saliency map is also investigated to optimize a support vector machine (SVM) classifier. Both feature map and kernel trick methods are explored to enhance the accuracy of the SVM classifier. In addition, novel inter- and intra-class histogram normalization methods are investigated to further boost the performance of the proposed method. Experiments using several publicly available benchmark datasets demonstrate that the proposed method achieves promising classification accuracy and superior performance over state-of-the-art methods.}
}
@article{Khatib20157549,
  title = {Data mining for fuzzy diagnosis systems in {LTE} networks},
  journal = {Expert Systems with Applications},
  volume = {42},
  number = {21},
  pages = {7549--7559},
  year = {2015},
  issn = {0957-4174},
  doi = {10.1016/j.eswa.2015.05.031},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417415003590},
  author = {Emil J. Khatib and Raquel Barco and Ana Gómez-Andrades and Pablo Muñoz and Inmaculada Serrano},
  keywords = {Self-healing, Self-Organizing Networks, LTE, Data mining, Data driven learning, Supervised learning, Fault management, Fuzzy systems, Big Data},
  abstract = {The recent developments in cellular networks, along with the increase in services, users and the demand for high quality, have raised the Operational Expenditure (OPEX). Self-Organizing Networks (SON) are the solution to reduce these costs. Within SON, self-healing is the functionality that aims to automatically solve problems in the radio access network, at the same time reducing the downtime and the impact on the user experience. Self-healing comprises four main functions: fault detection, root cause analysis, fault compensation and recovery. To perform the root cause analysis (also known as diagnosis), Knowledge-Based Systems (KBS) are commonly used, such as fuzzy logic. In this paper, a novel method for extracting the Knowledge Base for a KBS from solved troubleshooting cases is proposed. This method is based on data mining techniques, as opposed to the manual techniques currently used. The data mining problem of extracting knowledge out of LTE troubleshooting information can be considered a Big Data problem. Therefore, the proposed method has been designed so it can be easily scaled up to process a large volume of data with relatively low resources, as opposed to other existing algorithms. Tests show the feasibility and good results obtained by the diagnosis system created by the proposed methodology in LTE networks.}
}
@article{Guo2015168,
  title = {Tech mining to generate indicators of future national technological competitiveness: Nano-Enhanced Drug Delivery (NEDD) in the {US} and China},
  journal = {Technological Forecasting and Social Change},
  volume = {97},
  pages = {168--180},
  year = {2015},
  issn = {0040-1625},
  doi = {10.1016/j.techfore.2014.02.026},
  url = {http://www.sciencedirect.com/science/article/pii/S0040162514000900},
  author = {Ying Guo and Xiao Zhou and Alan L. Porter and Douglas K.R. Robinson},
  keywords = {Tech mining, Nano-Enhanced Drug Delivery, Comparative patterns, Innovation pathways},
  abstract = {“Global technological competitiveness” is widely acknowledged, but the challenge is to go beyond this recognition to develop empirical indicators of important transitions. These may concern particular technologies, the competitive position of particular organizations, or national/regional shifts. For decades, the US has been the world leader in biomedical technologies, with attendant implications for organizational priorities in terms of R\&D location and market targeting. Recent years have seen a tremendous acceleration in Asian research in most domains, including biomedical, particularly visible in China. This paper investigates comparative patterns between the US and China in a promising emerging area of biotechnology, Nano-Enhanced Drug Delivery. It then explores indicators of, and implications for, future transitions at the national level, an approach we label “Forecasting Innovation Pathways.”}
}
@article{No2015181,
  title = {A structured approach to explore knowledge flows through technology-based business methods by integrating patent citation analysis and text mining},
  journal = {Technological Forecasting and Social Change},
  volume = {97},
  pages = {181--192},
  year = {2015},
  issn = {0040-1625},
  doi = {10.1016/j.techfore.2014.04.007},
  url = {http://www.sciencedirect.com/science/article/pii/S0040162514001280},
  author = {Hyun Joung No and Yoonjung An and Yongtae Park},
  keywords = {Knowledge flow, Business method, Patent citation analysis, Text mining, Technological classes},
  abstract = {With information and communication technology (ICT) as an enabling platform, diversified new business methods (BMs) have been developed. These new technology-based BMs have played an important role in knowledge flow since they became patentable subject matter. However, there are not many studies on knowledge flow through technology-based BMs or BM patents in spite of its importance. As an attempt to provide a deeper understanding of technology-based BMs with regard to knowledge flow, this paper explores knowledge flows driven by technology-based BMs through investigating both cited and citing patents. In order to explore the knowledge flows, this paper proposes an algorithm that utilizes both the citation and textual information of BM patents. In addition to citation information, text data in patent documents are used to measure the degree of knowledge flow in a more accurate way. A case study is conducted with the BM patents related to postage metering systems, and the analysis result is presented in a positioning map that shows different knowledge flow patterns of technological classes. Moreover, the technology-based BM patents as knowledge flow drivers are classified based on the amount of knowledge exchanged between the base BM patents and their patent citations.}
}
@article{Sanmiquel201549,
  title = {Study of Spanish mining accidents using data mining techniques},
  journal = {Safety Science},
  volume = {75},
  pages = {49--55},
  year = {2015},
  issn = {0925-7535},
  doi = {10.1016/j.ssci.2015.01.016},
  url = {http://www.sciencedirect.com/science/article/pii/S092575351500017X},
  author = {Lluís Sanmiquel and Josep M. Rossell and Carla Vintró},
  keywords = {Mining accidents, Data mining, Bayesian network, Classification methods},
  abstract = {Mining is an economic sector with a high number of accidents. Mines are hazardous places and workers can suffer a wide variety of injuries. Utilizing a database composed of almost 70,000 occupational accident and fatality reports corresponding to the decade 2003–2012 in the Spanish mining sector, the paper analyzes the main causes of those accidents. To carry out the study, powerful statistical tools have been applied, such as Bayesian classifiers, decision trees and contingency tables, among other data mining techniques. Statistical analyses have been performed using Weka software, and behavioral patterns based on certain rules have been obtained. From these rules, some conclusions are extracted which can help to develop suitable prevention policies to reduce injuries and fatalities.}
}
@article{Noh20154348,
  title = {Keyword selection and processing strategy for applying text mining to patent analysis},
  journal = {Expert Systems with Applications},
  volume = {42},
  number = {9},
  pages = {4348--4360},
  year = {2015},
  issn = {0957-4174},
  doi = {10.1016/j.eswa.2015.01.050},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417415000652},
  author = {Heeyong Noh and Yeongran Jo and Sungjoo Lee},
  keywords = {Patent analysis, Text-mining, Keyword selection, Keyword processing, Document clustering},
  abstract = {Previous studies have applied various methodologies to analyze patent data for technology management, given the advances in available data analysis techniques. In particular, efforts have recently been made to use text-mining (i.e. extracting keywords from patent documents) for patent analysis purposes. The results of these studies may be affected by the keywords selected from the relevant documents, but, despite its importance, the existing literature has seldom explored strategies for selecting and processing keywords from patent documents. The purpose of this research is to fill this gap by focusing on keyword strategies for applying text-mining to patent data. Specifically, four factors are addressed: (1) which element of the patent documents to adopt for keyword selection, (2) what keyword selection methods to use, (3) how many keywords to select, and (4) how to transform the keyword selection results into an analyzable data format. An experiment based on an orthogonal array of the four factors was designed in order to identify the best strategy, in which the four factors were evaluated and compared through k-means clustering and entropy values. The research findings are expected to offer useful guidelines on how to select and process keywords for patent analysis, and so further increase the reliability and validity of research using text-mining for patent analysis.}
}
@article{Moustafa2015163,
  title = {Efficient mining fuzzy association rules from ubiquitous data streams},
  journal = {Alexandria Engineering Journal},
  volume = {54},
  number = {2},
  pages = {163--174},
  year = {2015},
  issn = {1110-0168},
  doi = {10.1016/j.aej.2015.03.015},
  url = {http://www.sciencedirect.com/science/article/pii/S1110016815000290},
  author = {Amal Moustafa and Badr Abuelnasr and Mohamed Said Abougabal},
  keywords = {Data mining, Fuzzy association rules, Fuzzy sets, Data streams, Ubiquitous data streams},
  abstract = {Due to developments in technology, a number of applications such as smart mobile phones, sensor networks and GPS devices produce huge amounts of ubiquitous data in the form of streams. Different from data in traditional static databases, ubiquitous data streams typically arrive continuously at high speed, in huge amounts, and with changing data distribution. Dealing with and extracting useful information from such data is a real challenge. This raises new issues that need to be considered when developing association rule mining techniques for these data. It should be noted that data in the real world are not represented in binary and numeric forms only, but may also be represented as quantitative values. Thus, fuzzy sets are very suitable for handling these values. In this paper the problem of mining fuzzy association rules from ubiquitous data streams is studied, and a novel technique, FFP_USTREAM (Fuzzy Frequent Pattern Ubiquitous Streams), is developed. This technique integrates fuzzy concepts with ubiquitous data streams, employing a sliding window approach, to mine fuzzy association rules. In addition, the complexity and the efficiency of this technique are discussed. Real data sets are used to test the proposed technique. Further research issues are also suggested.}
}
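
Since the abstract notes that quantitative values call for fuzzy sets, a minimal sketch of that first step is shown below: triangular membership functions map a purchased quantity onto "low"/"medium"/"high" grades. The set boundaries are invented for illustration; FFP_USTREAM's actual membership functions are not specified here.

def triangular(x, a, b, c):
    """Membership of x in a triangular fuzzy set peaking at b over [a, c]."""
    if x == b:
        return 1.0
    if x <= a or x >= c:
        return 0.0
    return (x - a) / (b - a) if x < b else (c - x) / (c - b)

# Invented fuzzy sets over a purchased-quantity attribute (0..10 units).
FUZZY_SETS = {"low": (0, 0, 5), "medium": (2, 5, 8), "high": (5, 10, 10)}

def fuzzify(quantity):
    """Map one quantitative value to a membership grade per fuzzy set."""
    return {name: triangular(quantity, *abc) for name, abc in FUZZY_SETS.items()}

print(fuzzify(4))  # {'low': 0.2, 'medium': 0.666..., 'high': 0.0}
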
@article{Xing2015168,
  title = {Participation-based student final performance prediction model through interpretable Genetic Programming: Integrating learning analytics, educational data mining and theory },
  journal = {Computers in Human Behavior },
  volume = {47},
  number = {0},
  pages = {168 - 181},
  year = {2015},
  note = {Learning Analytics, Educational Data Mining and data-driven Educational Decision Making },
  issn = {0747-5632},
  doi = {http://dx.doi.org/10.1016/j.chb.2014.09.034},
  url = {http://www.sciencedirect.com/science/article/pii/S0747563214004865},
  author = {Wanli Xing and Rui Guo and Eva Petakovic and Sean Goggins},
  keywords = {Learning analytics},
  keywords = {Educational data mining},
  keywords = {Prediction},
  keywords = {\{CSCL\}},
  keywords = {Activity theory},
  keywords = {Genetic Programming },
  abstract = {Abstract Building a student performance prediction model that is both practical and understandable for users is a challenging task fraught with confounding factors to collect and measure. Most current prediction models are difficult for teachers to interpret. This poses significant problems for model use (e.g. personalizing education and intervention) as well as model evaluation. In this paper, we synthesize learning analytics approaches, educational data mining (EDM) and \{HCI\} theory to explore the development of more usable prediction models and prediction model representations using data from a collaborative geometry problem solving environment: Virtual Math Teams with Geogebra (VMTwG). First, based on theory proposed by Hrastinski (2009) establishing online learning as online participation, we operationalized activity theory to holistically quantify students’ participation in the \{CSCL\} (Computer-supported Collaborative Learning) course. As a result, six variables, Subject, Rules, Tools, Division of Labor, Community, and Object, are constructed. This analysis of variables prior to the application of a model distinguishes our approach from prior approaches (feature selection, ad-hoc guesswork, etc.). The approach described diminishes data dimensionality and systematically contextualizes data in a semantic background. Second, an advanced modeling technique, Genetic Programming (GP), underlies the developed prediction model. We demonstrate how connecting the structure of \{VMTwG\} trace data to a theoretical framework and processing that data using the \{GP\} algorithmic approach outperforms traditional models in prediction rate and interpretability. Theoretical and practical implications are then discussed. }
}
@article{Wang20153,
  title = {An approach to rank reviews by fusing and mining opinions based on review pertinence },
  journal = {Information Fusion },
  volume = {23},
  number = {0},
  pages = {3 - 15},
  year = {2015},
  note = {},
  issn = {1566-2535},
  doi = {http://dx.doi.org/10.1016/j.inffus.2014.04.002},
  url = {http://www.sciencedirect.com/science/article/pii/S1566253514000487},
  author = {Jun-ze Wang and Zheng Yan and Laurence T. Yang and Ben-xiong Huang},
  keywords = {Review pertinence},
  keywords = {Review spam},
  keywords = {Retrieval model},
  keywords = {Opinion fusion},
  keywords = {Opinion mining },
  abstract = {Abstract Fusing and mining opinions from reviews posted on websites or social networks has become a popular research topic in recent years, as a way to analyze public opinion on a specific topic or product. Existing research has focused on the extraction, classification and summarization of opinions from reviews in news websites, forums and blogs. An important issue that has not been well studied is the degree of relevance between a review and its corresponding article. Prior work simply divides reviews into two classes, spam and non-spam, neglecting that non-spam reviews can have different degrees of relevance to the article. In this paper, we propose the notion of “Review Pertinence” to study the degree of this relevance. Unlike usual methods, we measure the pertinence of a review by considering not only the similarity between a review and its corresponding article, but also the correlation among reviews. Experimental results based on real data sets collected from a number of popular portal sites clearly show the effectiveness of our method in ranking reviews by pertinence, compared with three baseline methods. Thus, our method can be applied to efficiently retrieve reviews for opinion fusion and mining and to filter review spam in practice. }
}
@article{Keet201543,
  title = {The Data Mining \{OPtimization\} Ontology },
  journal = {Web Semantics: Science, Services and Agents on the World Wide Web },
  volume = {32},
  number = {0},
  pages = {43 - 53},
  year = {2015},
  note = {},
  issn = {1570-8268},
  doi = {http://dx.doi.org/10.1016/j.websem.2015.01.001},
  url = {http://www.sciencedirect.com/science/article/pii/S1570826815000025},
  author = {C. Maria Keet and Agnieszka Ławrynowicz and Claudia d’Amato and Alexandros Kalousis and Phong Nguyen and Raul Palma and Robert Stevens and Melanie Hilario},
  keywords = {Ontology},
  keywords = {\{OWL\}},
  keywords = {Data mining},
  keywords = {Meta-learning},
  keywords = {Semantic meta-mining },
  abstract = {Abstract The Data Mining \{OPtimization\} Ontology (DMOP) has been developed to support informed decision-making at various choice points of the data mining process. The ontology can be used by data miners and deployed in ontology-driven information systems. The primary purpose for which \{DMOP\} has been developed is the automation of algorithm and model selection through semantic meta-mining that makes use of an ontology-based meta-analysis of complete data mining processes in view of extracting patterns associated with mining performance. To this end, \{DMOP\} contains detailed descriptions of data mining tasks (e.g., learning, feature selection), data, algorithms, hypotheses such as mined models or patterns, and workflows. A development methodology was used for DMOP, including items such as competency questions and foundational ontology reuse. Several non-trivial modeling problems were encountered, and due to the complexity of the data mining details, the ontology requires the use of the \{OWL\} 2 \{DL\} profile. \{DMOP\} was successfully evaluated for semantic meta-mining and used in constructing the Intelligent Discovery Assistant, deployed in the popular data mining environment RapidMiner. }
}
@article{Lan2015767,
  title = {Fuzzy utility mining with upper-bound measure },
  journal = {Applied Soft Computing },
  volume = {30},
  number = {0},
  pages = {767 - 777},
  year = {2015},
  note = {},
  issn = {1568-4946},
  doi = {http://dx.doi.org/10.1016/j.asoc.2015.01.055},
  url = {http://www.sciencedirect.com/science/article/pii/S1568494615000769},
  author = {Guo-Cheng Lan and Tzung-Pei Hong and Yi-Hsin Lin and Shyue-Liang Wang},
  keywords = {Data mining},
  keywords = {Fuzzy data mining},
  keywords = {Fuzzy utility mining},
  keywords = {High fuzzy utility itemset},
  keywords = {Upper bound },
  abstract = {Abstract Fuzzy utility mining has been an emerging research issue because of its simplicity and comprehensibility. Different from traditional fuzzy data mining, fuzzy utility mining considers not only the quantities of items in transactions but also their profits when deriving high fuzzy utility itemsets. In this paper, we introduce a new fuzzy utility measure with the fuzzy minimum operator to evaluate the fuzzy utilities of itemsets. In addition, an effective fuzzy utility upper-bound model based on the proposed measure is designed to provide the downward-closure property in fuzzy sets, thus reducing the search space when finding high fuzzy utility itemsets. A two-phase fuzzy utility mining algorithm, named TPFU, is also proposed and described for solving the problem of fuzzy utility mining. Finally, the experimental results on both synthetic and real datasets show that the proposed algorithm has good performance. }
}
@article{Yang2015158,
  title = {Countering the concept-drift problems in big data by an incrementally optimized stream mining model },
  journal = {Journal of Systems and Software },
  volume = {102},
  number = {0},
  pages = {158 - 166},
  year = {2015},
  note = {},
  issn = {0164-1212},
  doi = {http://dx.doi.org/10.1016/j.jss.2014.07.010},
  url = {http://www.sciencedirect.com/science/article/pii/S0164121214001526},
  author = {Hang Yang and Simon Fong},
  keywords = {Concept drift},
  keywords = {Data stream mining},
  keywords = {Very fast decision tree },
  abstract = {Abstract Mining the potential value hidden behind big data has been a popular research topic around the world. In an infinite big data scenario, the underlying distribution of newly arrived data may differ from that of the older data. This phenomenon, known as the concept-drift problem, commonly exists in big data mining scenarios. In the past decade, decision tree inductions have used multi-tree learning to detect drift, using alternative trees as a solution. However, multi-tree algorithms consume more computing resources than single-tree ones. This paper proposes a single tree with an optimized node-splitting mechanism to detect drift in a test-then-train tree-building process. In the experiment, we compare the performance of the new method to some state-of-the-art single-tree and multi-tree algorithms. Results show that the new algorithm achieves good accuracy with a more compact model size and lower memory usage than the others. }
}
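
The "very fast decision tree" keyword refers to a standard stream-mining device worth making concrete: a node is split only once the Hoeffding bound guarantees, with confidence 1 - delta, that the best attribute's gain lead over the runner-up is real. A minimal sketch with illustrative numbers, not the paper's optimized splitting mechanism:

import math

def hoeffding_bound(value_range, delta, n):
    """epsilon = sqrt(R^2 ln(1/delta) / (2n)) after n stream examples."""
    return math.sqrt(value_range ** 2 * math.log(1.0 / delta) / (2.0 * n))

# Illustrative numbers: info-gain range R = 1, confidence 1 - 1e-6, n = 500.
eps = hoeffding_bound(1.0, 1e-6, 500)
best_gain, runner_up_gain = 0.42, 0.30
if best_gain - runner_up_gain > eps:
    print(f"split now: gain gap {best_gain - runner_up_gain:.3f} > eps {eps:.3f}")
else:
    print("keep accumulating examples before splitting")
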
@article{Hammer201561,
  title = {An acoustic position estimation prototype system for underground mining safety },
  journal = {Applied Acoustics },
  volume = {92},
  number = {0},
  pages = {61 - 74},
  year = {2015},
  note = {},
  issn = {0003-682X},
  doi = {http://dx.doi.org/10.1016/j.apacoust.2014.12.009},
  url = {http://www.sciencedirect.com/science/article/pii/S0003682X14003168},
  author = {F. Hammer and M. Pichler and H. Fenzl and A. Gebhard and C. Hesch},
  keywords = {Position estimation},
  keywords = {Acoustic localization},
  keywords = {Hyperbolic frequency modulated signal},
  keywords = {Underground mining },
  abstract = {Abstract The surroundings of underground mining machines represent a hazardous zone for miners due to bad visibility conditions for the engine driver. Within the EU-funded project FEATureFACE, we have developed a prototype system for the estimation of the miners’ positions around a machine employing time-of-flight measurements based on audible sound signals. We aimed at 2D-localization within a range of 10 m, and at the determination of a miner’s distance within a range of 50 m. Our system consists of a base station located at the machine’s side that comprises a set of six loudspeakers, and mobile tags that are worn by the miners. Individual sound signals are emitted by the loudspeakers and received at a mobile tag via a microphone that is mounted on the miner’s hard hat. Our system not only provides continuous estimates of the miners’ positions, but also yields estimates of the velocity and direction of their movement. We have evaluated the performance of our system in a parking-garage and in a training-mine. With regard to the stationary localization, our results show that our system provides an accuracy down to below 25 cm and a precision lower than 2 cm. }
}
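
The position estimation itself can be illustrated compactly: given time-of-flight measurements to loudspeakers at known coordinates, a linearized least-squares multilateration recovers the 2D position. The sketch below assumes ideal, noise-free ranges and invented speaker coordinates; it is not the prototype's signal chain, which must first extract time of flight from the hyperbolic frequency modulated signals.

import numpy as np

SPEED_OF_SOUND = 343.0  # m/s at roughly 20 degrees C

def locate(speakers, tofs):
    """Least-squares 2D position from ranges d_i = c * t_i to known speakers."""
    d = SPEED_OF_SOUND * np.asarray(tofs)
    (x0, y0), d0 = speakers[0], d[0]
    # Subtracting the first range equation linearizes the system:
    # 2(x_i - x_0) x + 2(y_i - y_0) y = d_0^2 - d_i^2 + x_i^2 - x_0^2 + y_i^2 - y_0^2
    A, b = [], []
    for (xi, yi), di in zip(speakers[1:], d[1:]):
        A.append([2 * (xi - x0), 2 * (yi - y0)])
        b.append(d0**2 - di**2 + xi**2 - x0**2 + yi**2 - y0**2)
    pos, *_ = np.linalg.lstsq(np.array(A), np.array(b), rcond=None)
    return pos

speakers = [(0, 0), (10, 0), (0, 10), (10, 10)]
true = np.array([3.0, 4.0])
tofs = [np.hypot(*(true - s)) / SPEED_OF_SOUND for s in speakers]
print(locate(speakers, tofs))  # ~ [3. 4.]
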
@incollection{Kotu2015275,
  title = {Chapter 9 - Text Mining },
  editor = {Kotu, Vijay and Deshpande, Bala},
  booktitle = {Predictive Analytics and Data Mining },
  publisher = {Morgan Kaufmann},
  edition = {},
  address = {Boston},
  year = {2015},
  pages = {275 - 303},
  isbn = {978-0-12-801460-8},
  doi = {http://dx.doi.org/10.1016/B978-0-12-801460-8.00009-4},
  url = {http://www.sciencedirect.com/science/article/pii/B9780128014608000094},
  author = {Vijay Kotu and Bala Deshpande},
  keywords = {Inverse document frequency},
  keywords = {keyword clustering},
  keywords = {n-grams},
  keywords = {stemming},
  keywords = {stop word filtering},
  keywords = {term frequency},
  keywords = {text analytics},
  keywords = {text classification},
  keywords = {text mining},
  keywords = {tokenization },
  abstract = {Abstract This chapter provides a detailed look into the emerging area of text mining and text analytics. It starts with a background of the origins of text mining and provides the motivation for this fascinating topic using the example of IBM's Watson, the Jeopardy!-winning computer program that was built almost entirely using concepts from text and data mining. The chapter introduces some key concepts important in the area of text analytics such as TF-IDF scores. Finally it describes two hands-on case studies in which the reader is shown how to use RapidMiner to address problems like document clustering and automatic gender classification based on text content. }
}
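
For readers new to the chapter's vocabulary, the sketch below computes TF-IDF from scratch over toy documents, touching several of the keywords above (tokenization, stop word filtering, term frequency, inverse document frequency). The stop word list and the log-based IDF variant are illustrative choices; other variants add smoothing.

import math

STOP_WORDS = {"the", "a", "is", "of", "and", "to", "on"}

def tokenize(text):
    """Lowercase, split on whitespace, drop stop words."""
    return [t for t in text.lower().split() if t not in STOP_WORDS]

docs = ["the cat sat on the mat", "the dog chased the cat", "dogs and cats"]
tokenized = [tokenize(d) for d in docs]
N = len(docs)

def tf_idf(term, doc_tokens):
    tf = doc_tokens.count(term) / len(doc_tokens)    # term frequency
    df = sum(term in toks for toks in tokenized)     # document frequency
    return tf * math.log(N / df)                     # TF times IDF

print(tf_idf("cat", tokenized[0]))  # "cat" occurs in 2 of 3 documents
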
@article{Turan2015169,
  title = {Automatize Document Topic and Subtopic Detection with Support of a Corpus },
  journal = {Procedia - Social and Behavioral Sciences },
  volume = {177},
  number = {0},
  pages = {169 - 177},
  year = {2015},
  note = {First Global Conference on Contemporary Issues in Education (GLOBE-EDU 2014) 12-14 July 2014, Las Vegas, \{USA\} },
  issn = {1877-0428},
  doi = {http://dx.doi.org/10.1016/j.sbspro.2015.02.373},
  url = {http://www.sciencedirect.com/science/article/pii/S1877042815017279},
  author = {Metin Turan and Coskun Sönmez},
  keywords = {topic detection},
  keywords = {text mining},
  keywords = {document summarization },
  abstract = {Abstract In this article, we propose a new automatic topic and subtopic detection method for documents, called paragraph extension. In paragraph extension, a document is considered as a set of paragraphs, and a paragraph merging technique is used to merge similar consecutive paragraphs until no similar consecutive paragraphs remain. Following this, similar word counts in merged paragraphs are summed up to construct subtopic scores, using a corpus designed so that words related to a subtopic can be found. The paragraph vectors are represented by subtopics instead of words. The subtopic of a paragraph is the most frequent one in the paragraph vector, while the topic of the document is the most dispersive subtopic in the document. An experimental topic/subtopic corpus was constructed for the sport and education topics. We also supplemented the corpus with WordNet to obtain synonyms. We evaluate the proposed method on a data set containing 40 randomly selected documents from the education and sport topics. The experimental results show that the average topic detection success ratio is about 83% and the subtopic detection ratio is about 68%. }
}
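
A minimal sketch of the paragraph-merging step is given below: consecutive paragraphs are merged while their bag-of-words cosine similarity stays above a threshold. The threshold value and toy paragraphs are invented, and the paper's corpus-based subtopic scoring is not reproduced.

import math
from collections import Counter

def cosine(p, q):
    """Cosine similarity of two texts as bags of words."""
    a, b = Counter(p.lower().split()), Counter(q.lower().split())
    dot = sum(a[w] * b[w] for w in a)
    norm = math.sqrt(sum(v * v for v in a.values())) * math.sqrt(sum(v * v for v in b.values()))
    return dot / norm

def merge_paragraphs(paras, threshold=0.3):
    merged = [paras[0]]
    for p in paras[1:]:
        if cosine(merged[-1], p) >= threshold:
            merged[-1] = merged[-1] + " " + p  # extend the previous paragraph
        else:
            merged.append(p)
    return merged

paras = ["the team won the match", "the match score was high", "tuition fees rose this year"]
print(merge_paragraphs(paras))  # first two merge; the third stays separate
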
@article{Schuh201532,
  title = {On visualization techniques for solar data mining },
  journal = {Astronomy and Computing },
  volume = {10},
  number = {0},
  pages = {32 - 42},
  year = {2015},
  note = {},
  issn = {2213-1337},
  doi = {http://dx.doi.org/10.1016/j.ascom.2014.12.003},
  url = {http://www.sciencedirect.com/science/article/pii/S2213133714000705},
  author = {M.A. Schuh and J.M. Banda and T. Wylie and P. McInerney and K. Ganesan Pillai and R.A. Angryk},
  keywords = {Solar images},
  keywords = {Visualization},
  keywords = {Data mining},
  keywords = {\{CBIR\} },
  abstract = {Abstract Large-scale data mining is often aided with graphic visualizations to facilitate a better understanding of the data and results. This is especially true for visual data and highly detailed data too complex to be easily understood in raw forms. In this work, we present several of our recent interdisciplinary works in data mining solar image repositories and discuss the over-arching need for effective visualizations of data, metadata, and results along the way. First, we explain the complex characteristics and overwhelming abundance of image data being produced by NASA’s Solar Dynamics Observatory (SDO). Then we discuss the wide scope of solar data mining and highlight visual results from work in data labeling, classification, and clustering. Lastly, we present an overview of the first-ever Content-Based Image Retrieval (CBIR) system for solar images, and conclude with a brief look at the direction of our future research. }
}
@article{Gao2014175,
  title = {Employers’ Expectations: A Probabilistic Text Mining Model },
  journal = {Procedia Engineering },
  volume = {85},
  number = {0},
  pages = {175 - 182},
  year = {2014},
  note = {Selected papers from Creative Construction Conference 2014 },
  issn = {1877-7058},
  doi = {http://dx.doi.org/10.1016/j.proeng.2014.10.542},
  url = {http://www.sciencedirect.com/science/article/pii/S1877705814019080},
  author = {Lu Gao and Neil Eldin},
  keywords = {Construction Management Skills},
  keywords = {Employer Expectation},
  keywords = {Text Mining},
  keywords = {Topic Model },
  abstract = {Abstract This study uses text mining techniques to analyze employment data posted on the internet. The objective is to identify knowledge areas, skills and expertise relevant to jobs in the construction industry. We utilized the fast-growing online job search engines to understand the construction job market and employer expectations. Over 20,000 job advertisements were downloaded from various websites between Oct 14th 2012 and March 15th 2013. We developed a text mining method to identify job qualification information from the downloaded pages. The developed algorithm is capable of deriving rules by automatically extracting statistically significant patterns present inside preselected qualifications. The selection rules can then be used to detect the presence of these qualifications in new pages. Once the qualifications are identified, we used the Latent Dirichlet Allocation (LDA) model to identify groups of skills that are required by employers. One of the major advantages of the \{LDA\} model is that it is an unsupervised approach, so no training is needed. The algorithm was applied to a case study as an illustrative example. }
}
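
As a rough illustration of the unsupervised LDA step, the sketch below fits a two-topic model to toy job-ad snippets with scikit-learn; the snippets, topic count, and top-word cutoff are placeholders rather than the study's configuration.

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

ads = [  # toy stand-ins for extracted job-qualification text
    "project scheduling cost estimating budget control",
    "site safety osha compliance inspection",
    "cost control budget forecasting estimating",
    "safety inspection hazard compliance training",
]

vec = CountVectorizer()
counts = vec.fit_transform(ads)
lda = LatentDirichletAllocation(n_components=2, random_state=0).fit(counts)

vocab = vec.get_feature_names_out()
for k, weights in enumerate(lda.components_):
    top = weights.argsort()[-4:][::-1]  # four heaviest words per topic
    print(f"skill group {k}:", [vocab[i] for i in top])
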
@article{Arbelaitz20137478,
  title = {Web usage and content mining to extract knowledge for modelling the users of the Bidasoa Turismo website and to adapt it },
  journal = {Expert Systems with Applications },
  volume = {40},
  number = {18},
  pages = {7478 - 7491},
  year = {2013},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2013.07.040},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417413005198},
  author = {Olatz Arbelaitz and Ibai Gurrutxaga and Aizea Lojo and Javier Muguerza and Jesús Maria Pérez and Iñigo Perona},
  keywords = {Bidasoa tourism website},
  keywords = {Web usage mining},
  keywords = {Web content mining},
  keywords = {Web user profiling},
  keywords = {Clustering},
  keywords = {Frequent pattern mining},
  keywords = {Topic modelling },
  abstract = {Abstract The tourism industry has experienced a shift from offline to online travellers and this has made the use of intelligent systems in the tourism sector crucial. These information systems should provide tourism consumers and service providers with the most relevant information, more decision support, greater mobility and the most enjoyable travel experiences. As a consequence, Destination Marketing Organizations (DMOs) not only have to respond by adopting new technologies, but also by interpreting and using the knowledge created by the use of these techniques. This work presents the design of a general and non-invasive web mining system, built using the minimum information stored in a web server (the content of the website and the information from the log files stored in Common Log Format (CLF)) and its application to the Bidasoa Turismo (BTw) website. The proposed system combines web usage and content mining techniques with the three following main objectives: generating user navigation profiles to be used for link prediction; enriching the profiles with semantic information to diversify them, which provides the \{DMO\} with a tool to introduce links that will match the users’ taste; and moreover, obtaining global and language-dependent user interest profiles, which provides the \{DMO\} staff with important information for future web designs, and allows them to design future marketing campaigns for specific targets. The system performed successfully, obtaining profiles which fit in more than 60% of cases with the real user navigation sequences and in more than 90% of cases with the user interests. Moreover, the automatically extracted semantic structure of the website and the interest profiles were validated by the \{BTw\} \{DMO\} staff, who found the knowledge provided to be very useful for the future. }
}
@article{Soysal20152582,
  title = {Association rule mining with mostly associated sequential patterns },
  journal = {Expert Systems with Applications },
  volume = {42},
  number = {5},
  pages = {2582 - 2592},
  year = {2015},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2014.10.049},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417414006812},
  author = {Ömer M. Soysal},
  keywords = {Association rule mining},
  keywords = {Interesting rules},
  keywords = {Pattern recognition},
  keywords = {Big data},
  keywords = {Knowledge discovery},
  keywords = {Data mining },
  abstract = {Abstract In this paper, we address the problem of mining structured data to find potentially useful patterns by association rule mining. Different from the traditional find-all-then-prune approach, a heuristic method is proposed to extract mostly associated patterns (MASPs). This approach utilizes a maximally-association constraint to generate patterns without searching the entire lattice of item combinations. This approach does not require a pruning process. The proposed approach requires fewer computational resources in terms of time and memory while generating a long sequence of patterns that have the highest co-occurrence. Furthermore, k-item patterns can be obtained thanks to the sub-lattice property of the MASPs. In addition, the algorithm produces a tree of the detected patterns; this tree can assist decision makers in the visual analysis of data. The outcome of the implemented algorithm is illustrated using traffic accident data. The proposed approach has the potential to be utilized in big data analytics. }
}
@article{Amo2015182,
  title = {Contextual preference mining for user profile construction },
  journal = {Information Systems },
  volume = {49},
  number = {0},
  pages = {182 - 199},
  year = {2015},
  note = {},
  issn = {0306-4379},
  doi = {http://dx.doi.org/10.1016/j.is.2014.11.009},
  url = {http://www.sciencedirect.com/science/article/pii/S0306437914001859},
  author = {Sandra de Amo and Mouhamadou Saliou Diallo and Cheikh Talibouya Diop and Arnaud Giacometti and Dominique Li and Arnaud Soulet},
  keywords = {Pattern mining},
  keywords = {Preference elicitation},
  keywords = {Contextual preference rule},
  keywords = {User profile mining },
  abstract = {Abstract The emergence of ubiquitous computing technologies in recent years has given rise to a new field of research consisting in incorporating context-aware preference querying facilities in database systems. One important step in this setting is the Preference Elicitation task, which consists in providing the user with ways to state his/her choice on pairs of objects with minimal effort. In this paper we propose an automatic preference elicitation method based on mining techniques. The method consists in extracting a user profile from a set of user preference samples. In our setting, a profile is specified by a set of contextual preference rules verifying properties of soundness and conciseness. After proving that the problem is NP-complete, we propose a resolution in two phases. The first phase extracts all individual user preferences by means of contextual preference rules. The second phase builds the user profile starting from this collection of rules using a greedy method. To assess the quality of user profiles, we propose three ranking techniques benefiting from these profiles that enable us to rank objects according to user preferences. We evaluate the efficacy of our three ranking strategies and compare them with a well-known ranking method (SVMRank). The evaluation is carried out through an extensive set of experiments executed on a real-world database of user preferences about movies. }
}
@article{Moro20151314,
  title = {Business intelligence in banking: A literature analysis from 2002 to 2013 using text mining and latent Dirichlet allocation },
  journal = {Expert Systems with Applications },
  volume = {42},
  number = {3},
  pages = {1314 - 1324},
  year = {2015},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2014.09.024},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417414005636},
  author = {Sérgio Moro and Paulo Cortez and Paulo Rita},
  keywords = {Banking},
  keywords = {Business intelligence},
  keywords = {Data mining},
  keywords = {Text mining},
  keywords = {Decision support systems },
  abstract = {Abstract This paper analyzes recent literature in search of trends in business intelligence applications for the banking industry. Searches were performed in relevant journals, resulting in 219 articles published between 2002 and 2013. To analyze such a large number of manuscripts, text mining techniques were used in pursuit of relevant terms in both the business intelligence and banking domains. Moreover, latent Dirichlet allocation modeling was used to group articles into several relevant topics. The analysis was conducted using a dictionary of terms belonging to both the banking and business intelligence domains. This procedure allowed for the identification of relationships between terms and the topics grouping articles, enabling hypotheses about research directions to emerge. To confirm these hypotheses, relevant articles were collected and scrutinized, allowing the text mining procedure to be validated. The results show that credit in banking is clearly the main application trend, particularly predicting risk and thus supporting credit approval or denial. There is also relevant interest in bankruptcy and fraud prediction. Customer retention seems to be associated, although weakly, with targeting, justifying bank offers to reduce churn. In addition, a large number of articles focused more on business intelligence techniques and their applications, using the banking industry just for evaluation, and thus not clearly claiming benefits for the banking business. By identifying these current research topics, this study also highlights opportunities for future research. }
}
@article{Wang201594,
  title = {Research on a frequent maximal induced subtrees mining method based on the compression tree sequence },
  journal = {Expert Systems with Applications },
  volume = {42},
  number = {1},
  pages = {94 - 100},
  year = {2015},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2014.07.053},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417414004692},
  author = {Jing Wang and Zhaojun Liu and Wei Li and Xiongfei Li},
  keywords = {Data mining},
  keywords = {Frequent subtree},
  keywords = {Induced subtree},
  keywords = {Maximal subtree},
  keywords = {Compression},
  keywords = {\{CFMIS\} },
  abstract = {Abstract Most complex data structures can be represented by a tree or graph structure, but tree structure mining is easier than graph structure mining. With the extensive application of semi-structured data, frequent tree pattern mining has become a hot topic. This paper proposes a compression tree sequence (CTS) to construct a compression tree model and to save the information of the original tree in the compression tree. As any subsequence of the \{CTS\} corresponds to a subtree of the original tree, it is efficient for mining subtrees. Furthermore, this paper proposes a frequent maximal induced subtrees mining method based on the compression tree sequence, \{CFMIS\} (compressed frequent maximal induced subtrees). The algorithm is primarily performed in four stages: first, the original data set is constructed as a compression tree model; then, a cut-edge process is run for the edges whose frequency is less than the threshold; next, the tree is compressed after the cut-edge step based on the different frequent edge degrees; and, last, maximal processing is run on the frequent subtree sets so that we can obtain the frequent maximal induced subtree set of the original data set. In each iteration, compression can reduce the size of the data set, so the traversal speed is faster than that of other algorithms. Experiments demonstrate that our algorithm can mine more frequent maximal induced subtrees in less time. }
}
@article{PletscherFrankild201583,
  title = {DISEASES: Text mining and data integration of disease–gene associations },
  journal = {Methods },
  volume = {74},
  number = {0},
  pages = {83 - 89},
  year = {2015},
  note = {Text mining of biomedical literature },
  issn = {1046-2023},
  doi = {http://dx.doi.org/10.1016/j.ymeth.2014.11.020},
  url = {http://www.sciencedirect.com/science/article/pii/S1046202314003831},
  author = {Sune Pletscher-Frankild and Albert Pallejà and Kalliopi Tsafou and Janos X. Binder and Lars Juhl Jensen},
  keywords = {Text mining},
  keywords = {Named entity recognition},
  keywords = {Information extraction},
  keywords = {Data integration},
  keywords = {Web resource },
  abstract = {Abstract Text mining is a flexible technology that can be applied to numerous different tasks in biology and medicine. We present a system for extracting disease–gene associations from biomedical abstracts. The system consists of a highly efficient dictionary-based tagger for named entity recognition of human genes and diseases, which we combine with a scoring scheme that takes into account co-occurrences both within and between sentences. We show that this approach is able to extract half of all manually curated associations with a false positive rate of only 0.16%. Nonetheless, text mining should not stand alone, but be combined with other types of evidence. For this reason, we have developed the \{DISEASES\} resource, which integrates the results from text mining with manually curated disease–gene associations, cancer mutation data, and genome-wide association studies from existing databases. The \{DISEASES\} resource is accessible through a web interface at http://diseases.jensenlab.org/, where the text-mining software and all associations are also freely available for download. }
}
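
The core of such a system can be caricatured in a few lines: dictionary lookup for gene and disease names, then counting within-sentence co-occurrences. The dictionaries and sentences below are toy stand-ins, and the actual DISEASES scoring scheme, which also weights between-sentence co-occurrences, is not reproduced.

from collections import Counter
from itertools import product

GENES = {"brca1", "tp53"}            # toy gene dictionary
DISEASES = {"breast cancer", "li-fraumeni syndrome"}  # toy disease dictionary

sentences = [
    "mutations in brca1 predispose to breast cancer",
    "tp53 is linked to li-fraumeni syndrome",
    "brca1 and tp53 are tumor suppressors",
]

pairs = Counter()
for s in sentences:
    genes = [g for g in GENES if g in s]        # dictionary-based tagging
    diseases = [d for d in DISEASES if d in s]
    for g, d in product(genes, diseases):       # within-sentence co-occurrence
        pairs[(g, d)] += 1

print(pairs.most_common())
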
@article{Chen201558,
  title = {Visualizing market structure through online product reviews: Integrate topic modeling, TOPSIS, and multi-dimensional scaling approaches },
  journal = {Electronic Commerce Research and Applications },
  volume = {14},
  number = {1},
  pages = {58 - 74},
  year = {2015},
  note = {},
  issn = {1567-4223},
  doi = {http://dx.doi.org/10.1016/j.elerap.2014.11.004},
  url = {http://www.sciencedirect.com/science/article/pii/S156742231400088X},
  author = {Kun Chen and Gang Kou and Jennifer Shang and Yang Chen},
  keywords = {Market structure},
  keywords = {Text mining},
  keywords = {Topic modeling},
  keywords = {Ranking of products},
  keywords = {\{TOPSIS\} },
  abstract = {Abstract Studies have shown that perceptual maps derived from online consumer-generated data are effective for depicting market structure such as demonstrating positioning of competitive brands. However, most text mining algorithms would require manual reading to merge extracted product features with synonyms. In response, Topic modeling is introduced to group synonyms together under a topic automatically, leading to convenient and accurate evaluation of brands based on consumers’ online reviews. To ensure the feasibility of employing Topic modeling in assessing competitive brands, we developed a unique and novel framework named \{WVAP\} (Weights from Valid Posterior Probability) based on the Scree plot technique. \{WVAP\} can filter the noise in the posterior distribution obtained from Topic modeling and improve accuracy in brand evaluation. A case study exploring online reviews of mobile phones is conducted. We extract topics that reflect the features of the cell phones with qualified validity. In addition to perceptual maps derived by multi-dimensional scaling (MDS) for product positioning, we also rank these products by \{TOPSIS\} (Technique for Order Performance by Similarity to Ideal Solution) so as to visualize the market structure from different perspectives. Our case study of cell phones shows that the proposed framework is effective in mining online reviews and providing insights into the competitive landscape. }
}
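
TOPSIS itself is compact enough to sketch: alternatives are scored by relative closeness to an ideal solution. In the sketch below the phone-by-feature matrix and weights are invented stand-ins for the topic-model-derived scores, and all criteria are treated as benefit criteria.

import numpy as np

def topsis(matrix, weights):
    """Rank alternatives by closeness to the ideal solution (benefit criteria)."""
    M = np.asarray(matrix, dtype=float)
    M = M / np.linalg.norm(M, axis=0)            # vector-normalize each criterion
    M = M * weights                               # apply criterion weights
    ideal, anti = M.max(axis=0), M.min(axis=0)   # ideal and anti-ideal solutions
    d_pos = np.linalg.norm(M - ideal, axis=1)
    d_neg = np.linalg.norm(M - anti, axis=1)
    return d_neg / (d_pos + d_neg)               # higher = closer to ideal

phones = [[8, 7, 9], [6, 9, 8], [9, 6, 7]]       # rows: brands; cols: camera, battery, screen
scores = topsis(phones, np.array([0.5, 0.3, 0.2]))
print(scores.argsort()[::-1])                    # ranking, best first
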
@article{Chemchem20151436,
  title = {From data mining to knowledge mining: Application to intelligent agents },
  journal = {Expert Systems with Applications },
  volume = {42},
  number = {3},
  pages = {1436 - 1445},
  year = {2015},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2014.08.024},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417414005065},
  author = {Amine Chemchem and Habiba Drias},
  keywords = {Knowledge mining},
  keywords = {Induction rules},
  keywords = {Classification},
  keywords = {Clustering},
  keywords = {Cognitive agent },
  abstract = {Abstract Over the last decade, the computing world has faced a huge wave of data. Data mining tasks were invoked to tackle this problem and extract the interesting knowledge. The recent emergence of some data mining techniques also provides many interesting induction rules. It is therefore judicious to process these induction rules in order to extract new strong patterns called meta-rules. This work explores this concept by proposing a new support for induction rule clustering and classification. The approach invokes the k-means and k-nn algorithms to mine induction rules using newly designed similarity measures and gravity center computation. The developed module has been implemented in the core of a cognitive agent in order to speed up its reasoning. This new architecture, called the Miner Intelligent Agent (MIA), is tested and evaluated on four public benchmarks that contain 25,000 rules, and finally compared to the classical architecture. As expected, the \{MIA\} clearly outperforms the classical cognitive agent. }
}
@article{Thorleuchter201525,
  title = {Idea mining for web-based weak signal detection },
  journal = {Futures },
  volume = {66},
  number = {0},
  pages = {25 - 34},
  year = {2015},
  note = {},
  issn = {0016-3287},
  doi = {http://dx.doi.org/10.1016/j.futures.2014.12.007},
  url = {http://www.sciencedirect.com/science/article/pii/S0016328714002018},
  author = {D. Thorleuchter and D. Van den Poel},
  keywords = {Web mining},
  keywords = {Strategic decision making},
  keywords = {Idea mining},
  keywords = {Weak signal analysis },
  abstract = {Abstract We investigate the impact of idea mining filtering on web-based weak signal detection to improve strategic decision making. Existing approaches for identifying weak signals in strategic decision making use environmental scanning procedures based on standard filtering algorithms. These algorithms discard patterns with low information content; however, they are not able to discard patterns with low relevance to a given strategic problem. Idea mining is proposed as an algorithm that identifies relevant textual patterns from documents or websites to solve a given (strategic) problem. Thus, it makes it possible to estimate a pattern’s relevance to the given strategic problem. The new methodology, which combines weak signal analysis and idea mining, contrasts with existing methodologies. In a case study, a web-based scanning procedure is implemented to identify textual internet data in the field of self-sufficient energy supply. Idea mining is applied for filtering, and weak signals are identified based on the proposed approach. The proposed approach is compared to a further, already evaluated, approach processed without idea mining. The results show that idea mining filtering improves the quality of weak signal analysis. This supports decision makers by providing early and suggestive signals of potentially emerging trends, even those with only little expressive strength. }
}
@article{Cha201597,
  title = {Mining web-based data to assess public response to environmental events },
  journal = {Environmental Pollution },
  volume = {198},
  number = {0},
  pages = {97 - 99},
  year = {2015},
  note = {},
  issn = {0269-7491},
  doi = {http://dx.doi.org/10.1016/j.envpol.2014.12.027},
  url = {http://www.sciencedirect.com/science/article/pii/S0269749114005272},
  author = {YoonKyung Cha and Craig A. Stow},
  keywords = {Twitter},
  keywords = {Google trends},
  keywords = {Social media},
  keywords = {Web search trends},
  keywords = {Data mining},
  keywords = {Algal blooms},
  keywords = {Public perception and interest },
  abstract = {Abstract We explore how the analysis of web-based data, such as Twitter and Google Trends, can be used to assess the social relevance of an environmental accident. The concept and methods are applied in the shutdown of drinking water supply at the city of Toledo, Ohio, USA. Toledo's notice, which persisted from August 1 to 4, 2014, is a high-profile event that directly influenced approximately half a million people and received wide recognition. The notice was given when excessive levels of microcystin, a byproduct of cyanobacteria blooms, were discovered at the drinking water treatment plant on Lake Erie. Twitter mining results illustrated an instant response to the Toledo incident, the associated collective knowledge, and public perception. The results from Google Trends, on the other hand, revealed how the Toledo event raised public attention on the associated environmental issue, harmful algal blooms, in a long-term context. Thus, when jointly applied, Twitter and Google Trend analysis results offer complementary perspectives. Web content aggregated through mining approaches provides a social standpoint, such as public perception and interest, and offers context for establishing and evaluating environmental management policies. }
}
@article{Soldatos20153,
  title = {How to learn about gene function: text-mining or ontologies? },
  journal = {Methods },
  volume = {74},
  number = {0},
  pages = {3 - 15},
  year = {2015},
  note = {Text mining of biomedical literature },
  issn = {1046-2023},
  doi = {http://dx.doi.org/10.1016/j.ymeth.2014.07.004},
  url = {http://www.sciencedirect.com/science/article/pii/S1046202314002412},
  author = {Theodoros G. Soldatos and Nelson Perdigão and Nigel P. Brown and Kenneth S. Sabir and Seán I. O’Donoghue},
  keywords = {Functional annotation},
  keywords = {Text mining},
  keywords = {Keyword enhancement},
  keywords = {\{GO\} term enrichment},
  keywords = {Systems biology},
  keywords = {Benchmarks },
  abstract = {Abstract As the amount of genome information increases rapidly, there is a correspondingly greater need for methods that provide accurate and automated annotation of gene function. For example, many high-throughput technologies – e.g., next-generation sequencing – are being used today to generate lists of genes associated with specific conditions. However, their functional interpretation remains a challenge, and many tools exist that try to characterize the function of gene-lists. Such systems typically rely on enrichment analysis and aim to give a quick insight into the underlying biology by presenting it in the form of a summary report. While the load of annotation may be alleviated by such computational approaches, the main challenge in modern annotation remains to develop a systems form of analysis in which a pipeline can effectively analyze gene-lists quickly and identify aggregated annotations through computerized resources. In this article we survey some of the many such tools and methods that have been developed to automatically interpret the biological functions underlying gene-lists. We overview current functional annotation aspects from the perspective of their epistemology (i.e., the underlying theories used to organize information about gene function into a body of verified and documented knowledge) and find that most of the currently used functional annotation methods fall broadly into one of two categories: they are based either on ‘known’ formally-structured ontology annotations created by ‘experts’ (e.g., the \{GO\} terms used to describe the function of Entrez Gene entries), or – perhaps more adventurously – on annotations inferred from literature (e.g., many text-mining methods use computer-aided reasoning to acquire knowledge represented in natural languages). Overall, however, deriving detailed and accurate insight from such gene lists remains a challenging task, and improved methods are called for. In particular, future methods need to (1) provide more holistic insight into the underlying molecular systems; (2) provide better follow-up experimental testing and treatment options, and (3) better manage gene lists derived from organisms that are not well-studied. We discuss some promising approaches that may help achieve these advances, especially the use of extended dictionaries of biomedical concepts and molecular mechanisms, as well as greater use of annotation benchmarks. }
}
@article{Yun20151149,
  title = {A fast perturbation algorithm using tree structure for privacy preserving utility mining },
  journal = {Expert Systems with Applications },
  volume = {42},
  number = {3},
  pages = {1149 - 1165},
  year = {2015},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2014.08.037},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417414005193},
  author = {Unil Yun and Jiwon Kim},
  keywords = {Privacy preserving},
  keywords = {Utility pattern mining},
  keywords = {Perturbation},
  keywords = {Frequent pattern mining},
  keywords = {Data mining },
  abstract = {Abstract As one of the important approaches in privacy preserving data mining, privacy preserving utility mining has been studied to find more meaningful results while database privacy is ensured and to improve algorithm efficiency by integrating fundamental utility pattern mining and privacy preserving data mining methods. However, its previous approaches require a significant amount of time to protect the privacy of data holders because they conduct database scanning operations an excessive number of times until all important information is hidden. Moreover, as the size of a given database becomes larger and a user-specified minimum utility threshold becomes lower, their performance degradation may be so uncontrollable that they cannot operate normally. To solve this problem, we propose a fast perturbation algorithm based on a tree structure which more quickly performs database perturbation processes for preventing sensitive information from being exposed. We also present extensive experimental comparisons between our proposed method and state-of-the-art algorithms using both real and synthetic datasets. They show that the proposed method not only has outstanding privacy preservation performance, comparable to that of previous methods, but also runs 5–10 times faster than the existing approaches on average. In addition, the proposed algorithm guarantees better scalability than that of the latest ones with respect to databases with the characteristics of gradually increasing attributes and transactions. }
}
@article{Fontaine201590,
  title = {Assessment of curated phenotype mining in neuropsychiatric disorder literature },
  journal = {Methods },
  volume = {74},
  number = {0},
  pages = {90 - 96},
  year = {2015},
  note = {Text mining of biomedical literature },
  issn = {1046-2023},
  doi = {http://dx.doi.org/10.1016/j.ymeth.2014.11.022},
  url = {http://www.sciencedirect.com/science/article/pii/S1046202314003855},
  author = {Jean-Fred Fontaine and Josef Priller and Eike Spruth and Carol Perez-Iratxeta and Miguel A. Andrade-Navarro},
  keywords = {Data mining},
  keywords = {Text mining},
  keywords = {Neuropsychiatric disorders},
  keywords = {Clinical diagnostics},
  keywords = {Data curation},
  keywords = {Drug therapy },
  abstract = {Abstract Clinical evaluation of patients and diagnosis of disorders are crucial for making decisions on appropriate therapies. In addition, in the case of genetic disorders resulting from gene abnormalities, phenotypic effects may guide basic research on the mechanisms of a disorder to find the mutated gene and therefore to propose novel targets for drug therapy. However, this approach is complicated by two facts. First, the relationship between genes and disorders is not simple: one gene may be related to multiple disorders and a disorder may be caused by mutations in different genes. Second, recognizing relevant phenotypes might be difficult for clinicians working with patients of closely related complex disorders. Neuropsychiatric disorders best illustrate these difficulties since phenotypes range from metabolic to behavioral aspects, the latter extremely complex. Based on our clinical expertise on five neurodegenerative disorders, and from the wealth of bibliographical data on neuropsychiatric disorders, we have built a resource to infer associations between genes, chemicals, and phenotypes for a total of 31 disorders. An initial step of automated text mining of the literature related to the 31 disorders returned thousands of enriched terms. A smaller set of relevant phenotypic terms was then manually selected by clinicians as relevant to the five neural disorders of their expertise and used to analyze the complete set of disorders. Analysis of the data indicates general relationships between neuropsychiatric disorders, which can be used to classify and characterize them. Correlation analyses allowed us to propose novel associations of genes and drugs with disorders. More generally, the results led us to uncover mechanisms of disease that span multiple neuropsychiatric disorders, for example that genes related to synaptic transmission and receptor functions tend to be involved in many disorders, whereas genes related to sensory perception and channel transport functions are associated with fewer disorders. Our study shows that starting from expertise covering a limited set of neurological disorders and using text and data mining methods, meaningful and novel associations regarding genes, chemicals and phenotypes can be derived for an expanded set of neuropsychiatric disorders. Our results are intended for clinicians to help them evaluate patients, and for basic scientists to propose new gene targets for drug therapies. This strategy can be extended to virtually all diseases and takes advantage of the ever increasing amount of biomedical literature. }
}
@article{Amornsinlaphachai201527,
  title = {The Design of a Framework for Cooperative Learning through Web Utilizing Data Mining Technique to Group Learners },
  journal = {Procedia - Social and Behavioral Sciences },
  volume = {174},
  number = {0},
  pages = {27 - 33},
  year = {2015},
  note = {International Conference on New Horizons in Education, \{INTE\} 2014, 25-27 June 2014, Paris, France },
  issn = {1877-0428},
  doi = {http://dx.doi.org/10.1016/j.sbspro.2015.01.622},
  url = {http://www.sciencedirect.com/science/article/pii/S1877042815006734},
  author = {Pensri Amornsinlaphachai},
  keywords = {data mining},
  keywords = {cooperative learning},
  keywords = {constructivist },
  abstract = {Abstract The purpose of this research is to design a framework for cooperative learning through the web, utilizing a data mining technique to group learners. The research and development methodology is employed in this study; however, only the framework design phase is presented here. The design phase comprises 5 steps: 1) studying and analyzing the related principles and theories, 2) studying the context of learning environments, 3) designing a framework for cooperative learning through the web using a data mining technique to group learners, 4) assessing the framework by six experts, and 5) improving the framework. Many theories and principles are employed in this research; for instance, data mining techniques, constructivist theory and the principle of media symbol systems. Two results are revealed. Firstly, the learning model consists of 5 components: (1) a forecasting and grouping module, (2) a cooperative learning community, (3) an expert community, (4) learning resources and (5) a quiz module. Secondly, the evaluation by the 6 experts shows that the framework complies with computer science principles and learning theories, and the experts rated the usability of the framework at a high level, 71.15 percent overall. In summary, the framework can be employed to design and develop a learning model appropriately. }
}
@article{Wang2015182,
  title = {Exploring technological opportunities by mining the gaps between science and technology: Microalgal biofuels },
  journal = {Technological Forecasting and Social Change },
  volume = {92},
  number = {0},
  pages = {182 - 195},
  year = {2015},
  note = {},
  issn = {0040-1625},
  doi = {http://dx.doi.org/10.1016/j.techfore.2014.07.008},
  url = {http://www.sciencedirect.com/science/article/pii/S0040162514002339},
  author = {Ming-Yeu Wang and Shih-Chieh Fang and Yu-Hsuan Chang},
  keywords = {Science and technology},
  keywords = {Technological opportunity},
  keywords = {Text mining},
  keywords = {Microalgae},
  keywords = {Biofuel},
  keywords = {High-dimensional data },
  abstract = {Abstract The interaction between scientific and technological knowledge facilitates exploration of new technological opportunities; however, gaps between them typically impede exploration of these opportunities. Scientific papers and technological patents record modern and advanced knowledge in scientific discovery and technological development; therefore, comparing their statuses can identify the gaps and explore potential technological opportunities. Because microalgal biofuels are a promising alternative energy resource devoid of territorial land use problems, this study applies text mining and an algorithm that can cluster objects of high-dimensional data to microalgal biofuel papers and patents, and explores their potential technological opportunities. The results demonstrate that a text-based clustering approach is appropriate for identifying scientific and technological applications for microalgal biofuels. The results indicate that microalgal photosynthesis and light utilization have abundant scientific outcomes for technological engineers to potentially apply. Technological opportunities exist in synthesis, harvesting, extraction, and lipid conversion. Scientific knowledge underlying biofuels accompanied by high-value co-products of production requires sustained exploration and reporting through research. These needs represent potential technological opportunities. }
}
@article{Lausch20155,
  title = {Data mining and linked open data – New perspectives for data analysis in environmental research },
  journal = {Ecological Modelling },
  volume = {295},
  number = {0},
  pages = {5 - 17},
  year = {2015},
  note = {Use of ecological indicators in models },
  issn = {0304-3800},
  doi = {http://dx.doi.org/10.1016/j.ecolmodel.2014.09.018},
  url = {http://www.sciencedirect.com/science/article/pii/S0304380014004335},
  author = {Angela Lausch and Andreas Schmidt and Lutz Tischendorf},
  keywords = {Data Mining},
  keywords = {Knowledge Discovery Process},
  keywords = {Data Mining Tools},
  keywords = {Linked Open Data},
  keywords = {Semantic},
  keywords = {RapidMiner },
  abstract = {Abstract The rapid development in information and computer technology has facilitated an extreme increase in the collection and storage of digital data. However, the associated rapid increase in digital data volumes does not automatically correlate with new insights and advances in our understanding of those data. The relatively new technique of data mining offers a promising way to extract knowledge and patterns from large, multidimensional and complex data sets. This paper therefore aims to provide a comprehensive overview of existing data mining techniques and related tools and to illustrate the potential of data mining for different research areas by means of example applications. Despite a number of conventional data mining techniques and methods, these classical approaches are restricted to isolated or “silo” data sets and therefore remain primarily stand-alone and specialized in nature. Highly complex and mostly interdisciplinary questions in environmental research cannot be answered sufficiently using isolated or area-based data mining approaches. To this end, the linked open data (LOD) approach will be presented as a new possibility in support of complex and inter-disciplinary data mining analysis. The merit of \{LOD\} will be explained using examples from medicine and environmental research. The advantages of \{LOD\} data mining will be weighed against classical data mining techniques. \{LOD\} offers unique and new possibilities for interdisciplinary data analysis, modeling and projection for multidimensional, complex landscapes and may facilitate new insights and answers to complex environmental questions. Our paper aims to encourage those research scientists who do not have extensive programming and data mining knowledge to take advantage of existing data mining tools, to embrace classical data mining and \{LOD\} approaches in support of gaining more insight and recognizing patterns in highly complex data sets. }
}
@article{Gerrikagoitia201575,
  title = {New Trends of Intelligent E-marketing Based on Web Mining for E-shops },
  journal = {Procedia - Social and Behavioral Sciences },
  volume = {175},
  number = {0},
  pages = {75 - 83},
  year = {2015},
  note = {Proceedings of the 3rd International Conference on Strategic Innovative Marketing (IC-SIM 2014) },
  issn = {1877-0428},
  doi = {http://dx.doi.org/10.1016/j.sbspro.2015.01.1176},
  url = {http://www.sciencedirect.com/science/article/pii/S1877042815012367},
  author = {Jon Kepa Gerrikagoitia and Iñigo Castander and Fidel Rebón and Aurkene Alzua-Sorzabal},
  keywords = {e-marketing},
  keywords = {e-commerce},
  keywords = {web mining},
  keywords = {trends e-marketing},
  keywords = {digital footprint },
  abstract = {Abstract E-marketing is the concentration of all efforts on adapting and developing marketing strategies for virtual spaces: web, social media… On an e-commerce site, e-marketing must help consumers with their purchases. This requires precise knowledge of the customer's preferences. For this reason, owners of e-shops must find out to whom, with what, how and when to address the customer, that is, to know the “consumer decision journey” and strengthen their engagement. This analysis is obtained when the customer is visiting an e-shop because (s)he leaves a digital footprint that can be used to understand his/her needs, desires and demands as well as to improve web presence. These data can be used for data mining to better understand the e-marketing and selling processes. In this paper a survey of 86 e-shops in Spain is presented. In the conclusions, some ideas for good e-marketing practices related to the buying behaviour analysis of customers are shown. Hence, new trends in e-marketing are suggested at the strategic, tactical and operational levels, in which different data mining techniques ease the purchase and the engagement. }
}
@incollection{McCue201551,
  title = {Chapter 4 - Process Models for Data Mining and Predictive Analysis },
  editor = {McCue, Colleen },
  booktitle = {Data Mining and Predictive Analysis (Second Edition) },
  publisher = {Butterworth-Heinemann},
  edition = {Second Edition},
  address = {Boston},
  year = {2015},
  pages = {51 - 74},
  isbn = {978-0-12-800229-2},
  doi = {http://dx.doi.org/10.1016/B978-0-12-800229-2.00004-3},
  url = {http://www.sciencedirect.com/science/article/pii/B9780128002292000043},
  author = {Colleen McCue},
  keywords = {\{CIA\} Intelligence Process},
  keywords = {CRoss Industry Standard Process for Data Mining (CRISP-DM)},
  keywords = {SEMMA},
  keywords = {spiral development},
  keywords = {“wicked” problems},
  keywords = {collection},
  keywords = {processing},
  keywords = {dissemination},
  keywords = {deployment },
  abstract = {Abstract Data mining and predictive analytics can best be understood as a process, rather than a specific technology, tool, or tradecraft. Chapter 4 includes an overview of four complementary approaches to analysis: the Central Intelligence Agency (CIA) Intelligence Process, the \{CRoss\} Industry Standard Process for Data Mining (CRISP-DM), SEMMA, and the Actionable Mining and Predictive Analysis process developed specifically for the operational public safety and security environment. The Actionable Mining and Predictive Analysis process addresses unique requirements and constraints associated with the applied setting, including data access and availability, public safety-specific evaluation, and the requirement for operationally relevant and actionable output. Data privacy and security also are addressed. }
}
@article{HuynhThiLe2015156,
  title = {An efficient and effective algorithm for mining top-rank-k frequent patterns },
  journal = {Expert Systems with Applications },
  volume = {42},
  number = {1},
  pages = {156 - 164},
  year = {2015},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2014.07.045},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417414004527},
  author = {Quyen Huynh-Thi-Le and Tuong Le and Bay Vo and Bac Le},
  keywords = {Data mining},
  keywords = {Pattern mining},
  keywords = {Top-rank-k frequent patterns},
  keywords = {N-list },
  abstract = {Abstract Frequent pattern mining generates a large number of candidates, which requires substantial memory and mining time. In real applications, only a small number of frequent patterns are used. Therefore, the mining of top-rank-k frequent patterns, which limits the number of mined frequent patterns by ranking them by frequency, has received increasing interest. This paper proposes the iNTK algorithm, an improved version of the \{NTK\} algorithm, for mining top-rank-k frequent patterns. The algorithm employs an N-list structure to represent patterns, and the subsume concept is used to speed up the process of mining top-rank-k patterns. Experiments are conducted to evaluate iNTK and \{NTK\} in terms of mining time and memory usage on eight datasets. The experimental results show that iNTK is more efficient and faster than NTK. }
}
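The top-rank-k task above can be illustrated without the paper's N-list machinery: mine frequent itemsets with an off-the-shelf FP-Growth and keep only the patterns whose support falls within the k highest distinct support values. A minimal Python sketch using the mlxtend package on toy transactions (this is not the iNTK algorithm itself, which has no common library implementation):

import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth

transactions = [["a", "b", "c"], ["a", "b"], ["a", "c"], ["b", "c"], ["a"]]
te = TransactionEncoder()
onehot = pd.DataFrame(te.fit(transactions).transform(transactions), columns=te.columns_)

# Mine every pattern that appears at least once, then keep the patterns
# whose support is among the k highest distinct support values ("ranks").
patterns = fpgrowth(onehot, min_support=0.2, use_colnames=True)
k = 3
top_supports = sorted(patterns["support"].unique(), reverse=True)[:k]
top_rank_k = patterns[patterns["support"].isin(top_supports)]
print(top_rank_k.sort_values("support", ascending=False))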
@article{Loglisci2015265,
  title = {Relational mining for discovering changes in evolving networks },
  journal = {Neurocomputing },
  volume = {150, Part A},
  number = {0},
  pages = {265 - 288},
  year = {2015},
  note = {Bioinspired and knowledge based techniques and applications; The Vitality of Pattern Recognition and Image Analysis; Data Stream Classification and Big Data Analytics; Selected papers from the 16th International Conference on Knowledge-Based and Intelligent Information & Engineering Systems (KES 2012); Selected papers from the 6th Iberian Conference on Pattern Recognition and Image Analysis (IbPRIA 2013) },
  issn = {0925-2312},
  doi = {http://dx.doi.org/10.1016/j.neucom.2014.08.079},
  url = {http://www.sciencedirect.com/science/article/pii/S0925231214012995},
  author = {Corrado Loglisci and Michelangelo Ceci and Donato Malerba},
  keywords = {Evolving networks},
  keywords = {Discovery of evolution chains},
  keywords = {Discovery of change patterns},
  keywords = {Change mining in networked data },
  abstract = {Abstract Networks are data structures more and more frequently used for modeling interactions in social and biological phenomena, as well as between various types of devices, tools and machines. They can be either static or dynamic, depending on whether the modeled interactions are fixed or changeable over time. Static networks have been extensively investigated in data mining, while fewer studies have focused on dynamic networks and how to discover complex patterns in large, evolving networks. In this paper we focus on the task of discovering changes in evolving networks and we overcome some limitations of existing methods (i) by resorting to a relational approach for representing networks characterized by heterogeneous nodes and/or heterogeneous relationships, and (ii) by proposing a novel algorithm for discovering changes in the structure of a dynamic network over time. Experimental results and comparisons with existing approaches on real-world datasets prove the effectiveness and efficiency of the proposed solution and provide some insights on the effect of some parameters in discovering and modeling the evolution of the whole network, or a subpart of it. }
}
@article{Liu201278,
  title = {A probabilistic graphical model for topic and preference discovery on social media },
  journal = {Neurocomputing },
  volume = {95},
  number = {0},
  pages = {78 - 88},
  year = {2012},
  note = {Learning from Social Media Network },
  issn = {0925-2312},
  doi = {http://dx.doi.org/10.1016/j.neucom.2011.05.039},
  url = {http://www.sciencedirect.com/science/article/pii/S0925231212001488},
  author = {Lu Liu and Feida Zhu and Lei Zhang and Shiqiang Yang},
  keywords = {Social media mining},
  keywords = {Topic model},
  keywords = {Preference discovery },
  abstract = {Many web applications today thrive on offering services for large-scale multimedia data, e.g., Flickr for photos and YouTube for videos. However, these data, while rich in content, are usually sparse in textual descriptive information. For example, a video clip is often associated with only a few tags. Moreover, the textual descriptions are often overly specific to the video content. Such characteristics make it very challenging to discover topics at a satisfactory granularity on this kind of data. In this paper, we propose a generative probabilistic model named Preference-Topic Model (PTM) to introduce the dimension of user preferences to enhance the insufficient textual information. \{PTM\} is a unified framework to combine the tasks of user preference discovery and document topic mining together. Through modeling user-document interactions, \{PTM\} can not only discover topics and preferences simultaneously, but also enable them to inform and benefit each other in a unified framework. As a result, \{PTM\} can extract better topics and preferences from sparse data. The experimental results on real-life video application data show that \{PTM\} is superior to \{LDA\} in discovering informative topics and preferences in terms of clustering-based evaluations. Furthermore, the experimental results on \{DBLP\} data demonstrate that \{PTM\} is a general model which can be applied to other kinds of user–document interactions. }
}
@article{D’Oca2015395,
  title = {Occupancy schedules learning process through a data mining framework },
  journal = {Energy and Buildings },
  volume = {88},
  number = {0},
  pages = {395 - 408},
  year = {2015},
  note = {},
  issn = {0378-7788},
  doi = {http://dx.doi.org/10.1016/j.enbuild.2014.11.065},
  url = {http://www.sciencedirect.com/science/article/pii/S0378778814010329},
  author = {Simona D’Oca and Tianzhen Hong},
  keywords = {Occupant behavior},
  keywords = {Data mining},
  keywords = {Occupancy schedule},
  keywords = {Behavioral pattern},
  keywords = {Office building},
  keywords = {Building simulation },
  abstract = {Abstract Building occupancy is a paramount factor in building energy simulations. Specifically, lighting, plug loads, \{HVAC\} equipment utilization, fresh air requirements and internal heat gain or loss greatly depend on the level of occupancy within a building. Appropriate methodologies to describe and reproduce the intricate network responsible for human-building interactions are needed. Extrapolation of patterns from big data streams is a powerful analysis technique which will allow for a better understanding of energy usage in buildings. A three-step data mining framework is applied to discover occupancy patterns in office spaces. First, a data set of 16 offices with 10-min interval occupancy data over a two-year period is mined through a decision tree model which predicts occupancy presence. Then a rule induction algorithm is used to learn a pruned set of rules from the results of the decision tree model. Finally, a cluster analysis is employed in order to obtain consistent patterns of occupancy schedules. The identified occupancy rules and schedules are represented as four archetypal working profiles that can be used as input to current building energy modeling programs, such as EnergyPlus or IDA-ICE, to investigate the impact of occupant presence on design, operation and energy use in office buildings. }
}
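Two of the three steps in this framework map directly onto standard tools; the sketch below uses scikit-learn on synthetic data (the feature layout and occupancy thresholds are invented for illustration). The rule-induction step is omitted, since scikit-learn offers no direct equivalent:

import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans

rng = np.random.default_rng(0)
hours = rng.integers(0, 24, 1000)
weekdays = rng.integers(0, 7, 1000)
present = ((hours >= 9) & (hours < 18) & (weekdays < 5)).astype(int)  # toy ground truth

# Step 1: a decision tree predicts occupancy presence from time features
tree = DecisionTreeClassifier(max_depth=4).fit(np.c_[hours, weekdays], present)

# Step 3: cluster daily occupancy profiles (24 hourly presence fractions per
# office) into a few archetypal working profiles
profiles = rng.random((16, 24))
schedules = KMeans(n_clusters=4, n_init=10, random_state=0).fit(profiles)
print(tree.score(np.c_[hours, weekdays], present), schedules.cluster_centers_.shape)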
@article{Li2015311,
  title = {Identifying emerging hotel preferences using Emerging Pattern Mining technique },
  journal = {Tourism Management },
  volume = {46},
  number = {0},
  pages = {311 - 321},
  year = {2015},
  note = {},
  issn = {0261-5177},
  doi = {http://dx.doi.org/10.1016/j.tourman.2014.06.015},
  url = {http://www.sciencedirect.com/science/article/pii/S0261517714001174},
  author = {Gang Li and Rob Law and Huy Quan Vu and Jia Rong and Xinyuan (Roy) Zhao},
  keywords = {Hotel preference},
  keywords = {Data mining},
  keywords = {Travel behavior},
  keywords = {Emerging pattern mining},
  keywords = {Natural language processing },
  abstract = {Abstract Hotel managers continue to find ways to understand traveler preferences, with the aim of improving their strategic planning, marketing, and product development. Traveler preference is unpredictable; for example, hotel guests used to prefer having a telephone in the room, but now favor a fast Internet connection. Changes in preference influence the performance of hotel businesses, thus creating the need to identify and address the demands of their guests. Most existing studies focus on current demand attributes and not on emerging ones. Thus, hotel managers may find it difficult to make appropriate decisions in response to changes in travelers' concerns. To address these challenges, this paper adopts the Emerging Pattern Mining technique to identify emergent hotel features of interest to international travelers. Data are derived from 118,000 records of online reviews. The methods and findings can help hotel managers gain insights into travelers' interests, enabling the former to gain a better understanding of the rapid changes in tourist preferences. }
}
@article{Wei201580,
  title = {DF-Miner: Domain-specific facet mining by leveraging the hyperlink structure of Wikipedia },
  journal = {Knowledge-Based Systems },
  volume = {77},
  number = {0},
  pages = {80 - 91},
  year = {2015},
  note = {},
  issn = {0950-7051},
  doi = {http://dx.doi.org/10.1016/j.knosys.2015.01.001},
  url = {http://www.sciencedirect.com/science/article/pii/S0950705115000088},
  author = {Bifan Wei and Jun Liu and Qinghua Zheng and Wei Zhang and Chenchen Wang and Bei Wu},
  keywords = {Domain-specific facet mining},
  keywords = {Community structure},
  keywords = {Hyperlink structure},
  keywords = {Scale-free property},
  keywords = {Wikipedia },
  abstract = {Abstract Organizing a set of domain-specific terms into a meaningful hierarchical structure is an essential task for faceted search and knowledge organization. In this paper, we present an automatic approach, called domain-specific facet (DF)-Miner, to discover \{DFs\} based on the hyperlink structure within the Wikipedia article pages. Each article page corresponds to a domain-specific term. The hyperlink structures among article pages represent the connections among these terms. The community structure of the connections among a domain-specific term set reveals the facets of the domain. The terms with more connections provide important clues for facet labeling. Accordingly, DF-Miner first constructs a domain-specific hyperlink graph from the Wikipedia article pages. Then it extracts a tree structure from the Wikipedia category pages. DF-Miner groups the terms of a domain into multiple facets based on the result of community detection. Finally, DF-Miner selects a meaningful label for each facet based on the connection number of terms and the extracted tree structure from the category pages. Two experiments were conducted with six real-world datasets to evaluate DF-Miner. The experimental results show that DF-Miner performs better than the textual content-based approaches. }
}
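The core of DF-Miner's pipeline, grouping linked terms into facets by community detection and labeling each facet by a well-connected term, can be approximated with networkx. A sketch on a toy hyperlink graph (the degree-based label heuristic is a simplification of the paper's method):

import networkx as nx
from networkx.algorithms.community import greedy_modularity_communities

# Toy term hyperlink graph: two tight groups joined by one bridge edge
links = [("tree", "graph"), ("graph", "path"), ("tree", "path"),
         ("sort", "search"), ("search", "hash"), ("sort", "hash"),
         ("path", "search")]
G = nx.Graph(links)

for facet in greedy_modularity_communities(G):
    label = max(facet, key=G.degree)   # most-connected term as the facet label
    print(label, sorted(facet))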
@article{Nicoletti2013638,
  title = {Mining interests for user profiling in electronic conversations },
  journal = {Expert Systems with Applications },
  volume = {40},
  number = {2},
  pages = {638 - 645},
  year = {2013},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2012.07.075},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417412009311},
  author = {Matias Nicoletti and Silvia Schiaffino and Daniela Godoy},
  keywords = {Topic identification},
  keywords = {Text mining},
  keywords = {Semantic analysis},
  keywords = {Encyclopedia knowledge},
  keywords = {User profiling },
  abstract = {The increasing number of Web-based tasks currently requires personalization strategies to improve the user experience. However, building user profiles is a hard task, since users do not usually give explicit information about their interests. Therefore, interests must be mined implicitly from electronic sources, such as chat and discussion forums. In this work, we present a novel method for topic detection from online informal conversations. Our approach combines: (i) Wikipedia, an extensive source of knowledge, (ii) a concept association strategy, and (iii) a variety of text-mining techniques, such as \{POS\} tagging and named entity recognition. We performed a comparative evaluation procedure to search for the optimal combination of techniques, achieving encouraging results. }
}
@article{He2015116,
  title = {Rockburst laboratory tests database — Application of data mining techniques },
  journal = {Engineering Geology },
  volume = {185},
  number = {0},
  pages = {116 - 130},
  year = {2015},
  note = {},
  issn = {0013-7952},
  doi = {http://dx.doi.org/10.1016/j.enggeo.2014.12.008},
  url = {http://www.sciencedirect.com/science/article/pii/S0013795214003421},
  author = {Manchao He and L. Ribeiro e Sousa and Tiago Miranda and Gualong Zhu},
  keywords = {Rockburst},
  keywords = {Experimental tests},
  keywords = {Data mining},
  keywords = {Rockburst index },
  abstract = {Abstract Rockburst is characterized by a violent explosion of a block causing a sudden rupture in the rock and is quite common in deep tunnels. It is critical to understand the phenomenon of rockburst, focusing on the patterns of occurrence, so these events can be avoided and/or managed, saving costs and possibly lives. The failure mechanism of rockburst needs to be better understood. Laboratory experiments are under way at the Laboratory for Geomechanics and Deep Underground Engineering (SKLGDUE) in Beijing, and the system is described. A large number of rockburst tests were performed and their information collected, stored in a database and analyzed. Data Mining (DM) techniques were applied to the database in order to develop predictive models for the rockburst maximum stress (σRB) and rockburst risk index (IRB), which otherwise require the results of such tests to be determined. With the developed models it is possible to predict these parameters with high accuracy using data from the rock mass and the specific project. }
}
@article{Gadia2015111,
  title = {Parallel Text Mining in Multicore Systems Using FP-tree Algorithm },
  journal = {Procedia Computer Science },
  volume = {45},
  number = {0},
  pages = {111 - 117},
  year = {2015},
  note = {International Conference on Advanced Computing Technologies and Applications (ICACTA) },
  issn = {1877-0509},
  doi = {http://dx.doi.org/10.1016/j.procs.2015.03.100},
  url = {http://www.sciencedirect.com/science/article/pii/S1877050915003361},
  author = {Krishna Gadia and Kiran Bhowmick},
  keywords = {Parallel FP-Growth},
  keywords = {Data Mining},
  keywords = {Frequent Keywords Mining},
  keywords = {Trend analysis},
  keywords = {Multicore Mining },
  abstract = {Abstract Frequent keyword mining (FKM) is a useful tool for discovering frequently occurring keywords in data. Many algorithms have been developed to speed up mining performance on single-core systems. Unfortunately, when the dataset is huge, both memory use and computational cost can still be extremely expensive. In this paper, we parallelize the FP-Growth algorithm on multicore machines. We partition the huge database into as many parts as there are cores and utilize the combined strength of all the cores to achieve maximum performance. We propose to use the generated \{FP\} tree and its rules for trend analysis of news data. }
}
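The partition-per-core strategy described here can be sketched with Python's multiprocessing; the example below parallelizes only frequent-keyword counting (single-item patterns), not the full FP-Growth over partitions, which would additionally merge per-partition FP-trees:

from collections import Counter
from multiprocessing import Pool, cpu_count

def count_partition(docs):
    c = Counter()
    for doc in docs:
        c.update(set(doc.lower().split()))  # document frequency, not term frequency
    return c

def frequent_keywords(docs, min_df):
    n = cpu_count()
    chunks = [docs[i::n] for i in range(n)]  # one partition per core
    with Pool(n) as pool:
        total = sum(pool.map(count_partition, chunks), Counter())
    return {w: f for w, f in total.items() if f >= min_df}

if __name__ == "__main__":
    docs = ["data mining for news", "news trend analysis", "mining news data"]
    print(frequent_keywords(docs, min_df=2))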
@article{Raju2015102,
  title = {Exploring factors associated with pressure ulcers: A data mining approach },
  journal = {International Journal of Nursing Studies },
  volume = {52},
  number = {1},
  pages = {102 - 111},
  year = {2015},
  note = {},
  issn = {0020-7489},
  doi = {http://dx.doi.org/10.1016/j.ijnurstu.2014.08.002},
  url = {http://www.sciencedirect.com/science/article/pii/S0020748914002053},
  author = {Dheeraj Raju and Xiaogang Su and Patricia A. Patrician and Lori A. Loan and Mary S. McCarthy},
  keywords = {Data mining},
  keywords = {Predictive modeling},
  keywords = {Pressure ulcers},
  keywords = {Braden scale },
  abstract = {Abstract Background Pressure ulcers are associated with a nearly three-fold increase in in-hospital mortality. It is essential to investigate how other factors besides the Braden scale could enhance the prediction of pressure ulcers. Data mining modeling techniques can be beneficial for conducting this type of analysis. Data mining techniques have been applied extensively in health care, but are not widely used in nursing research. Purpose To remedy this methodological gap, this paper will review, explain, and compare several data mining models to examine patient-level factors associated with pressure ulcers, based on a four-year study from military hospitals in the United States. Methods The variables included in the analysis are easily accessible demographic information and medical measurements. Logistic regression, decision trees, random forests, and multivariate adaptive regression splines were compared based on their performance and interpretability. Results The random forests model had the highest accuracy (C-statistic), with the following variables, in order of importance, ranked highest in predicting pressure ulcers: days in the hospital, serum albumin, age, blood urea nitrogen, and total Braden score. Conclusion Data mining techniques, particularly random forests, are useful in predictive modeling. It is important for hospitals and health care systems to use their own data over time for pressure ulcer risk prediction, to develop risk models based upon more than the total Braden score, and specific to their patient population. }
}
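Comparing classifiers by C-statistic, as above, is straightforward with scikit-learn; the sketch below uses synthetic imbalanced data in place of the non-public hospital records and omits multivariate adaptive regression splines, which have no maintained scikit-learn implementation:

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

# Imbalanced toy data: roughly 10% positives, mimicking a rare outcome
X, y = make_classification(n_samples=2000, n_features=10, weights=[0.9], random_state=0)

models = {
    "logistic regression": LogisticRegression(max_iter=1000),
    "decision tree": DecisionTreeClassifier(max_depth=5),
    "random forest": RandomForestClassifier(n_estimators=200, random_state=0),
}
for name, model in models.items():
    auc = cross_val_score(model, X, y, cv=5, scoring="roc_auc").mean()
    print(f"{name}: C-statistic = {auc:.3f}")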
@article{Xintong20147987,
  title = {Brief survey of crowdsourcing for data mining },
  journal = {Expert Systems with Applications },
  volume = {41},
  number = {17},
  pages = {7987 - 7994},
  year = {2014},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2014.06.044},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417414003984},
  author = {Guo Xintong and Wang Hongzhi and Yangqiu Song and Gao Hong},
  keywords = {Data mining},
  keywords = {Crowdsourcing},
  keywords = {Quality control},
  keywords = {Survey },
  abstract = {Abstract Crowdsourcing allows large-scale and flexible invocation of human input for data gathering and analysis, which introduces a new paradigm for the data mining process. Traditional data mining methods often require experts in analytic domains to annotate the data; however, this is expensive and usually takes a long time. Crowdsourcing enables the use of heterogeneous background knowledge from volunteers and distributes the annotation process across small contributions from different participants. This paper reviews the state of the art in crowdsourcing for data mining in recent years. We first review the challenges and opportunities of data mining tasks using crowdsourcing and summarize a common framework for them. Then we highlight several exemplar works for each component of the framework, including question design, data mining and quality control. Finally, we discuss the limitations of crowdsourcing for data mining and suggest related areas for future research. }
}
@incollection{Lu2015219,
  title = {Chapter 11 - Bioinformatics and Biostatistics in Mining Epigenetic Disease Markers and Targets },
  editor = {Zheng, Y. George },
  booktitle = {Epigenetic Technological Applications },
  publisher = {Academic Press},
  edition = {},
  address = {Boston},
  year = {2015},
  pages = {219 - 244},
  isbn = {978-0-12-801080-8},
  doi = {http://dx.doi.org/10.1016/B978-0-12-801080-8.00011-9},
  url = {http://www.sciencedirect.com/science/article/pii/B9780128010808000119},
  author = {Junyan Lu and Hao Zhang and Liyi Zhang and Cheng Luo},
  keywords = {Epigenetic},
  keywords = {epigenomic},
  keywords = {biomarker},
  keywords = {target},
  keywords = {bioinformatics},
  keywords = {biostatistics},
  keywords = {data mining },
  abstract = {Abnormal epigenetic alterations tightly correlate with human diseases. It is increasingly recognized that epigenetic profiles can be indicators of disease states and that epigenetic machinery can be therapeutically targeted. However, the discovery of informative epigenetic biomarkers and suitable epigenetic drug targets is a daunting task, due to the complexity of epigenetic mechanisms. The recent development of high-throughput technologies has led to an explosion of biological data and has enabled mining biomarkers and drug targets in a more systematic way. Bioinformatic and biostatistical approaches are well suited to dealing with large data sets and are therefore widely used in mining disease biomarkers and drug targets in this “omic” era. The epigenetic fields are already beginning to benefit from high-throughput technologies and advanced computational tools. In this chapter, we outline a systematic method for discovering epigenetic biomarkers and drug targets for human diseases by adapting bioinformatic and biostatistical approaches. }
}
@article{Vulić2015111,
  title = {Probabilistic topic modeling in multilingual settings: An overview of its methodology and applications },
  journal = {Information Processing & Management },
  volume = {51},
  number = {1},
  pages = {111 - 147},
  year = {2015},
  note = {},
  issn = {0306-4573},
  doi = {http://dx.doi.org/10.1016/j.ipm.2014.08.003},
  url = {http://www.sciencedirect.com/science/article/pii/S0306457314000739},
  author = {Ivan Vulić and Wim De Smet and Jie Tang and Marie-Francine Moens},
  keywords = {Multilingual probabilistic topic models},
  keywords = {Cross-lingual text mining},
  keywords = {Cross-lingual knowledge transfer},
  keywords = {Cross-lingual information retrieval},
  keywords = {Language-independent data representation},
  keywords = {Non-parallel data },
  abstract = {Abstract Probabilistic topic models are unsupervised generative models which model document content as a two-step generation process, that is, documents are observed as mixtures of latent concepts or topics, while topics are probability distributions over vocabulary words. Recently, a significant research effort has been invested into transferring the probabilistic topic modeling concept from monolingual to multilingual settings. Novel topic models have been designed to work with parallel and comparable texts. We define multilingual probabilistic topic modeling (MuPTM) and present the first full overview of the current research, methodology, advantages and limitations in MuPTM. As a representative example, we choose a natural extension of the omnipresent \{LDA\} model to multilingual settings called bilingual \{LDA\} (BiLDA). We provide a thorough overview of this representative multilingual model from its high-level modeling assumptions down to its mathematical foundations. We demonstrate how to use the data representation by means of output sets of (i) per-topic word distributions and (ii) per-document topic distributions coming from a multilingual probabilistic topic model in various real-life cross-lingual tasks involving different languages, without any external language pair dependent translation resource: (1) cross-lingual event-centered news clustering, (2) cross-lingual document classification, (3) cross-lingual semantic similarity, and (4) cross-lingual information retrieval. We also briefly review several other applications present in the relevant literature, and introduce and illustrate two related modeling concepts: topic smoothing and topic pruning. In summary, this article encompasses the current research in multilingual probabilistic topic modeling. By presenting a series of potential applications, we reveal the importance of the language-independent and language pair independent data representations by means of MuPTM. We provide clear directions for future research in the field by providing a systematic overview of how to link and transfer aspect knowledge across corpora written in different languages via the shared space of latent cross-lingual topics, that is, how to effectively employ learned per-topic word distributions and per-document topic distributions of any multilingual probabilistic topic model in various cross-lingual applications. }
}
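The two output sets this survey's applications build on, per-topic word distributions and per-document topic distributions, can be produced with gensim's monolingual LdaModel (BiLDA itself has no standard library implementation; monolingual LDA is its special case). A toy sketch:

from gensim.corpora import Dictionary
from gensim.models import LdaModel

docs = [["topic", "model", "word"], ["word", "distribution", "topic"],
        ["document", "topic", "mixture"], ["latent", "topic", "model"]]
dictionary = Dictionary(docs)
corpus = [dictionary.doc2bow(d) for d in docs]
lda = LdaModel(corpus, num_topics=2, id2word=dictionary, passes=20, random_state=0)

print(lda.show_topics(num_words=4))        # per-topic word distributions
print(lda.get_document_topics(corpus[0]))  # per-document topic distribution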
@article{Petz2014899,
  title = {Computational approaches for mining user’s opinions on the Web 2.0 },
  journal = {Information Processing & Management },
  volume = {50},
  number = {6},
  pages = {899 - 908},
  year = {2014},
  note = {},
  issn = {0306-4573},
  doi = {http://dx.doi.org/10.1016/j.ipm.2014.07.005},
  url = {http://www.sciencedirect.com/science/article/pii/S030645731400065X},
  author = {Gerald Petz and Michał Karpowicz and Harald Fürschuß and Andreas Auinger and Václav Stříteský and Andreas Holzinger},
  keywords = {Opinion mining},
  keywords = {Noisy text},
  keywords = {Text preprocessing},
  keywords = {User generated content},
  keywords = {Data mining },
  abstract = {Abstract The emerging research area of opinion mining deals with computational methods in order to find, extract and systematically analyze people’s opinions, attitudes and emotions towards certain topics. While providing interesting market research information, the user generated content existing on the Web 2.0 presents numerous challenges regarding systematic analysis, the differences and unique characteristics of the various social media channels being one of them. This article reports on the determination of such particularities, and deduces their impact on text preprocessing and opinion mining algorithms. The effectiveness of different algorithms is evaluated in order to determine their applicability to the various social media channels. Our research shows that text preprocessing algorithms are mandatory for mining opinions on the Web 2.0 and that some of these algorithms are sensitive to errors and mistakes contained in the user generated content. }
}
@article{KhadjehNassirtoussi2015306,
  title = {Text mining of news-headlines for \{FOREX\} market prediction: A Multi-layer Dimension Reduction Algorithm with semantics and sentiment },
  journal = {Expert Systems with Applications },
  volume = {42},
  number = {1},
  pages = {306 - 324},
  year = {2015},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2014.08.004},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417414004801},
  author = {Arman Khadjeh Nassirtoussi and Saeed Aghabozorgi and Teh Ying Wah and David Chek Ling Ngo},
  keywords = {News mining},
  keywords = {News semantic analysis},
  keywords = {Market sentiment analysis},
  keywords = {Market prediction},
  keywords = {\{FOREX\} prediction },
  abstract = {Abstract In this paper a novel approach is proposed to predict intraday directional-movements of a currency-pair in the foreign exchange market based on the text of breaking financial news-headlines. The motivation behind this work is twofold: First, although market-prediction through text-mining is shown to be a promising area of work in the literature, the text-mining approaches utilized in it at this stage are not much beyond basic ones as it is still an emerging field. This work is an effort to put more emphasis on the text-mining methods and tackle some specific aspects thereof that are weak in previous works, namely: the problem of high dimensionality as well as the problem of ignoring sentiment and semantics in dealing with textual language. This research assumes that addressing these aspects of text-mining has an impact on the quality of the achieved results. The proposed system proves this assumption to be right. The second part of the motivation is to research a specific market, namely, the foreign exchange market, which seems not to have been researched in the previous works based on predictive text-mining. Therefore, results of this work also successfully demonstrate a predictive relationship between this specific market-type and the textual data of news. Besides the above two main components of the motivation, there are other specific aspects that make the setup of the proposed system and the conducted experiment unique, for example, the use of news article-headlines only and not news article-bodies, which enables usage of short pieces of text rather than long ones; or the use of general financial breaking news without any further filtration. In order to accomplish the above, this work produces a multi-layer algorithm that tackles each of the mentioned aspects of the text-mining problem at a designated layer. The first layer is termed the Semantic Abstraction Layer and addresses the problem of co-reference in text mining that is contributing to sparsity. Co-reference occurs when two or more words in a text corpus refer to the same concept. This work produces a custom approach by the name of Heuristic-Hypernyms Feature-Selection which creates a way to recognize words with the same parent-word to be regarded as one entity. As a result, prediction accuracy increases significantly at this layer which is attributed to appropriate noise-reduction from the feature-space. The second layer is termed the Sentiment Integration Layer, which integrates sentiment analysis capability into the algorithm by proposing a sentiment weight by the name of SumScore that reflects investors’ sentiment. Additionally, this layer reduces the dimensions by eliminating those that are of zero value in terms of sentiment and thereby improves prediction accuracy. The third layer encompasses a dynamic model creation algorithm, termed Synchronous Targeted Feature Reduction (STFR). It is suitable for the challenge at hand, where the mining of a stream of text is concerned. It updates the models with the most recent information available and, more importantly, it ensures that the dimensions are reduced to the absolute minimum. The algorithm and each of its layers are extensively evaluated using real market data and news content across multiple years and have proven to be solid and superior to any other comparable solution. The proposed techniques implemented in the system result in significantly high directional-accuracies of up to 83.33%. 
On top of a well-rounded multifaceted algorithm, this work contributes a much-needed research framework for this context, with a test-bed of data that should make future research endeavors more convenient. The produced algorithm is scalable and its modular design allows improvement in each of its layers in future research. This paper provides ample details to reproduce the entire system and the conducted experiments. }
}
@article{Shanavas2015329,
  title = {Ontology-based Document Mining System for \{IT\} Support Service },
  journal = {Procedia Computer Science },
  volume = {46},
  number = {0},
  pages = {329 - 336},
  year = {2015},
  note = {Proceedings of the International Conference on Information and Communication Technologies, \{ICICT\} 2014, 3-5 December 2014 at Bolgatty Palace & Island Resort, Kochi, India },
  issn = {1877-0509},
  doi = {http://dx.doi.org/10.1016/j.procs.2015.02.028},
  url = {http://www.sciencedirect.com/science/article/pii/S1877050915000927},
  author = {Niloofer Shanavas and Shimmi Asokan},
  keywords = {Semantic Retrieval},
  keywords = {Case-based reasoning},
  keywords = {Data Mining},
  keywords = {\{IT\} problem management},
  keywords = {\{IT\} support},
  keywords = {Experience management. },
  abstract = {Abstract Information Technology (IT) is a vital and integral part of every organization. \{IT\} executives are constantly faced with problems that are difficult to tackle and time consuming. Experience is required to solve these problems more easily and quickly. We can utilize case-based reasoning (CBR), data mining and information retrieval (IR) techniques to automate \{IT\} problem solving and experience management. In this paper, we propose an \{IT\} ontology-based system for semantic retrieval that increases the efficiency and quality of \{IT\} support service. The proposed approach retrieves similar problem/solution pairs based on the concepts in the query and performs better than the traditional keyword-based approach, especially in cases where the keywords of the relevant documents do not match the keywords in the query. }
}
@article{KhadjehNassirtoussi20147653,
  title = {Text mining for market prediction: A systematic review },
  journal = {Expert Systems with Applications },
  volume = {41},
  number = {16},
  pages = {7653 - 7670},
  year = {2014},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2014.06.009},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417414003455},
  author = {Arman Khadjeh Nassirtoussi and Saeed Aghabozorgi and Teh Ying Wah and David Chek Ling Ngo},
  keywords = {Online sentiment analysis},
  keywords = {Social media text mining},
  keywords = {News sentiment analysis},
  keywords = {\{FOREX\} market prediction},
  keywords = {Stock prediction based on news },
  abstract = {Abstract The quality of the interpretation of the sentiment in the online buzz in the social media and the online news can determine the predictability of financial markets and cause huge gains or losses. That is why a number of researchers have turned their full attention to the different aspects of this problem lately. However, there is no well-rounded theoretical and technical framework for approaching the problem to the best of our knowledge. We believe the existing lack of such clarity on the topic is due to its interdisciplinary nature that involves at its core both behavioral-economic topics as well as artificial intelligence. We dive deeper into the interdisciplinary nature and contribute to the formation of a clear frame of discussion. We review the related works that are about market prediction based on online-text-mining and produce a picture of the generic components that they all have. We, furthermore, compare each system with the rest and identify their main differentiating factors. Our comparative analysis of the systems expands onto the theoretical and technical foundations behind each. This work should help the research community to structure this emerging field and identify the exact aspects which require further research and are of special significance. }
}
@article{Assous2014560,
  title = {Mathematical and numerical methods for Vlasov–Maxwell equations: The contribution of data mining },
  journal = {Comptes Rendus Mécanique },
  volume = {342},
  number = {10–11},
  pages = {560 - 569},
  year = {2014},
  note = {Theoretical and numerical approaches for Vlasov-Maxwell equations },
  issn = {1631-0721},
  doi = {http://dx.doi.org/10.1016/j.crme.2014.06.010},
  url = {http://www.sciencedirect.com/science/article/pii/S1631072114001491},
  author = {Franck Assous and Joël Chaskalovic},
  keywords = {Data mining},
  keywords = {Error estimate},
  keywords = {Vlasov–Maxwell equations},
  keywords = {Asymptotic analysis},
  keywords = {Paraxial model },
  abstract = {Abstract This paper deals with the applications of data mining techniques to the evaluation of numerical solutions of Vlasov–Maxwell models. This is part of the topic of characterizing the model and approximation errors via learning techniques. We give two examples of application. The first one aims at comparing two Vlasov–Maxwell approximate models. In the second one, a scheme based on data mining techniques is proposed to characterize the errors between a P1 and a P2 finite element Particle-In-Cell approach. Beyond these examples, this original approach should operate in all cases where intricate numerical simulations, such as those for the Vlasov–Maxwell equations, play a central part. }
}
@article{Bourgeois2015120,
  title = {How to improve robustness in Kohonen maps and display additional information in Factorial Analysis: Application to text mining },
  journal = {Neurocomputing },
  volume = {147},
  number = {0},
  pages = {120 - 135},
  year = {2015},
  note = {Advances in Self-Organizing Maps: Selected Papers from the Workshop on Self-Organizing Maps 2012 (WSOM 2012) },
  issn = {0925-2312},
  doi = {http://dx.doi.org/10.1016/j.neucom.2013.12.057},
  url = {http://www.sciencedirect.com/science/article/pii/S0925231214007140},
  author = {Nicolas Bourgeois and Marie Cottrell and Benjamin Déruelle and Stéphane Lamassé and Patrick Letrémy},
  keywords = {Text mining},
  keywords = {Kohonen maps},
  keywords = {Factorial Analysis},
  keywords = {Middle ages scientific literature},
  keywords = {Graphs },
  abstract = {Abstract This article is an extended version of a paper presented at the WSOM'2012 conference (Bourgeois et al., 2012 [1]). We display a combination of factorial projections, the \{SOM\} algorithm and graph techniques applied to a text mining problem. The corpus contains eight medieval manuscripts which were used to teach arithmetic techniques to merchants. Among the techniques for Data Analysis, those used for Lexicometry (such as Factorial Analysis) highlight the discrepancies between manuscripts. The reason for this is that they focus on the deviation from independence between words and manuscripts. Still, we also want to discover and characterize the common vocabulary across the whole corpus. Using the properties of stochastic Kohonen maps, which define neighborhoods between inputs in a non-deterministic way, we highlight the words which seem to play a special role in the vocabulary. We call them fickle and use them to improve both Kohonen map robustness and the significance of \{FCA\} visualization. Finally we use graph algorithms to exploit this fickleness for the classification of words. }
}
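The "fickle words" idea, repeated stochastic SOM runs with unstable co-locations flagged, can be sketched with the MiniSom package. Random vectors stand in for the lexical profiles, and the fickleness score below (distance of pairwise co-location rates from 0 and 1) is a simplified proxy for the paper's criterion:

import numpy as np
from minisom import MiniSom

rng = np.random.default_rng(0)
data = rng.random((30, 8))                 # 30 "words" with 8-dim lexical profiles
runs, n = 20, len(data)
together = np.zeros((n, n))

for seed in range(runs):
    som = MiniSom(5, 5, 8, sigma=1.0, learning_rate=0.5, random_seed=seed)
    som.train_random(data, 500)
    cells = [som.winner(x) for x in data]  # winning map unit per word
    for i in range(n):
        for j in range(n):
            together[i, j] += cells[i] == cells[j]

rate = together / runs                     # co-location frequency per word pair
# Rates near 0 or 1 are stable pairings; rates near 0.5 mark unstable words.
fickleness = (0.5 - np.abs(rate - 0.5)).mean(axis=1)
print(np.argsort(fickleness)[::-1][:5])    # indices of the five most fickle words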
@article{Jin2015126,
  title = {Technology-driven roadmaps for identifying new product/market opportunities: Use of text mining and quality function deployment },
  journal = {Advanced Engineering Informatics },
  volume = {29},
  number = {1},
  pages = {126 - 138},
  year = {2015},
  note = {},
  issn = {1474-0346},
  doi = {http://dx.doi.org/10.1016/j.aei.2014.11.001},
  url = {http://www.sciencedirect.com/science/article/pii/S1474034614001104},
  author = {Gyungmi Jin and Yujin Jeong and Byungun Yoon},
  keywords = {Technology roadmap (TRM)},
  keywords = {Technology-driven approach},
  keywords = {Patent analysis},
  keywords = {Text mining},
  keywords = {Keyword analysis},
  keywords = {Quality function deployment (QFD) },
  abstract = {Abstract A technology roadmap (TRM), an approach that is applied to the development of an emerging technology to meet business goals, is one of the most frequently adopted tools to support the process of technology innovation. Although many studies have dealt with \{TRMs\} that are designed primarily for a market-driven technology planning process, a technology-driven \{TRM\} is far less researched than a market-driven one. Furthermore, approaches to a technology-driven roadmap using quantitative technological information have rarely been studied. Thus, the aim of this research is to propose a new methodological framework to identify both profitable markets and promising product concepts based on technology information. This study suggests two quality function deployment (QFD) matrices to draw up the \{TRM\} in order to find new business opportunities. A case study is presented to illustrate the proposed approach using patents on the solar-lighting devices, which is catching on as a high-tech way to prevent environmental pollution and reduce fuel costs. }
}
@article{Mythily2015619,
  title = {Clustering Models for Data Stream Mining },
  journal = {Procedia Computer Science },
  volume = {46},
  number = {0},
  pages = {619 - 626},
  year = {2015},
  note = {Proceedings of the International Conference on Information and Communication Technologies, \{ICICT\} 2014, 3-5 December 2014 at Bolgatty Palace & Island Resort, Kochi, India },
  issn = {1877-0509},
  doi = {http://dx.doi.org/10.1016/j.procs.2015.02.107},
  url = {http://www.sciencedirect.com/science/article/pii/S1877050915001714},
  author = {R. Mythily and Aisha Banu and Shriram Raghunathan},
  keywords = {Data streams},
  keywords = {information retrieval},
  keywords = {data mining },
  abstract = {Abstract The scope of this research is to aggregate news content that exists in data streams. A data stream may cover several research issues. A user may only be interested in a subset of these issues; there could also be many different issues from multiple streams that discuss a similar topic from different perspectives. A user may be interested in a topic but not know how to collect all feeds related to it. The objective is to cluster all stories in the data streams into a hierarchical structure to better serve readers. The work utilizes segment-wise distributional clustering, which shows its effectiveness on data streams. To better serve news readers, advanced data organization is highly desired. Once they catch a glimpse of a topic, users can browse the returned hierarchy and find other stories/feeds on the internet talking about the same topic. Because stories change dynamically, the segment-wise distributional clustering algorithm must be able to process information incrementally. }
}
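The incremental-processing requirement stated at the end of this abstract can be illustrated with scikit-learn's MiniBatchKMeans, which updates clusters batch by batch (the paper's segment-wise distributional clustering itself is not publicly available, so this is only an analogue):

import numpy as np
from sklearn.cluster import MiniBatchKMeans

km = MiniBatchKMeans(n_clusters=3, random_state=0, n_init=3)
rng = np.random.default_rng(0)
for _ in range(10):                  # ten arriving segments of the stream
    batch = rng.random((20, 5))      # toy story vectors (e.g., tf-idf features)
    km.partial_fit(batch)            # update cluster centers incrementally
print(km.cluster_centers_.shape)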
@article{Rivera2014128,
  title = {A text mining framework for advancing sustainability indicators },
  journal = {Environmental Modelling & Software },
  volume = {62},
  number = {0},
  pages = {128 - 138},
  year = {2014},
  note = {},
  issn = {1364-8152},
  doi = {http://dx.doi.org/10.1016/j.envsoft.2014.08.016},
  url = {http://www.sciencedirect.com/science/article/pii/S1364815214002400},
  author = {Samuel J. Rivera and Barbara S. Minsker and Daniel B. Work and Dan Roth},
  keywords = {Sustainability indicators},
  keywords = {Text mining},
  keywords = {Informatics},
  keywords = {Knowledge discovery },
  abstract = {Abstract Assessing and tracking sustainability indicators (SI) is challenging because studies are often expensive and time consuming, the resulting indicators are difficult to track, and they usually have limited social input and acceptance, a critical element of sustainability. The central premise of this work is to explore the feasibility of identifying, tracking and reporting \{SI\} by analyzing unstructured digital news articles with text mining methods. Using San Mateo County, California, as a case study, a non-mutually exclusive supervised classification algorithm with natural language processing techniques is applied to analyze sustainability content in news articles and compare the results with \{SI\} reports created by Sustainable San Mateo County (SSMC) using traditional methods. Results showed that the text mining approach could identify all of the indicators highlighted as important in the reports and that the method has potential for identifying region-specific SI, as well as providing insights on the underlying causes of sustainability problems. }
}
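The "non-mutually exclusive supervised classification" above is a multi-label text classification problem; a scikit-learn sketch with invented toy articles and indicator labels (predictions on such tiny data are purely illustrative):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MultiLabelBinarizer

articles = ["new transit line cuts commute emissions",
            "housing costs rise across the county",
            "school budget adds water conservation program"]
labels = [["transportation", "air quality"], ["housing"], ["education", "water"]]

mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(labels)        # one binary column per indicator category
clf = make_pipeline(TfidfVectorizer(), OneVsRestClassifier(LogisticRegression()))
clf.fit(articles, Y)
print(mlb.inverse_transform(clf.predict(["county expands bus routes"])))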
@article{Anwar2014349,
  title = {A social graph based text mining framework for chat log investigation },
  journal = {Digital Investigation },
  volume = {11},
  number = {4},
  pages = {349 - 362},
  year = {2014},
  note = {},
  issn = {1742-2876},
  doi = {http://dx.doi.org/10.1016/j.diin.2014.10.001},
  url = {http://www.sciencedirect.com/science/article/pii/S1742287614001091},
  author = {Tarique Anwar and Muhammad Abulaish},
  keywords = {Text mining},
  keywords = {Chat logs mining},
  keywords = {Digital forensics},
  keywords = {Social graph generation},
  keywords = {Cyber crime investigation },
  abstract = {Abstract This paper presents a unified social graph based text mining framework to identify digital evidences from chat logs data. It considers both users' conversation and interaction data in group-chats to discover overlapping users' interests and their social ties. The proposed framework applies the n-gram technique in association with a self-customized hyperlink-induced topic search (HITS) algorithm to identify key-terms representing users' interests, key-users, and key-sessions. We propose a social graph generation technique to model users' interactions, where ties (edges) between a pair of users (nodes) are established only if they participate in at least one common group-chat session, and weights are assigned to the ties based on the degree of overlap in users' interests and interactions. Finally, we present three possible cyber-crime investigation scenarios and a user-group identification method for each of them. We present our experimental results on a data set comprising 1100 chat logs of 11,143 chat sessions continued over a period of 29 months from January 2010 to May 2012. Experimental results suggest that the proposed framework is able to identify key-terms, key-users, key-sessions, and user-groups from chat logs data, all of which are crucial for cyber-crime investigation. Though the chat logs here are recovered from a single computer, it is very likely that logs would be collected from multiple computers in a real scenario. In this case, logs collected from multiple computers can be combined to generate a richer social graph. However, our experiments show that the objectives can be achieved even with logs recovered from a single computer by using group-chats data to draw relationships between every pair of users. }
}
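Ranking key-users and key-sessions with HITS, the backbone of this framework, can be sketched on a user-to-session participation graph with networkx (the paper's self-customized HITS over terms, users, and sessions is simplified here to the plain algorithm, with hub scores ranking users and authority scores ranking sessions):

import networkx as nx

G = nx.DiGraph()
participation = {"alice": ["s1", "s2"], "bob": ["s1"], "carol": ["s2", "s3"]}
for user, sessions in participation.items():
    for s in sessions:
        G.add_edge(user, s)          # edge: user participated in session

hubs, authorities = nx.hits(G, max_iter=500)
key_users = sorted(participation, key=hubs.get, reverse=True)
key_sessions = sorted({s for ss in participation.values() for s in ss},
                      key=authorities.get, reverse=True)
print(key_users, key_sessions)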
@article{MarreseTaylor20147764,
  title = {A novel deterministic approach for aspect-based opinion mining in tourism products reviews },
  journal = {Expert Systems with Applications },
  volume = {41},
  number = {17},
  pages = {7764 - 7775},
  year = {2014},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2014.05.045},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417414003315},
  author = {Edison Marrese-Taylor and Juan D. Velásquez and Felipe Bravo-Marquez},
  keywords = {Aspect-based},
  keywords = {Opinion mining},
  keywords = {Tourism},
  keywords = {Product reviews },
  abstract = {Abstract This work proposes an extension of Bing Liu’s aspect-based opinion mining approach in order to apply it to the tourism domain. The extension concerns the fact that users refer differently to different kinds of products when writing reviews on the Web. Since Liu’s approach is focused on physical product reviews, it could not be directly applied to the tourism domain, which presents features that are not considered by the model. Through a detailed study of on-line tourism product reviews, we identified these features and then modeled them in our extension, proposing the use of new and more complex NLP-based rules for the tasks of subjective and sentiment classification at the aspect-level. We also address the task of opinion visualization and summarization and propose new methods to help users digest the vast availability of opinions in an easy manner. Our work also included the development of a generic architecture for an aspect-based opinion mining tool, which we then used to create a prototype and analyze opinions from TripAdvisor in the context of the tourism industry in Los Lagos, a Chilean administrative region also known as the Lake District. Results prove that our extension is able to perform better than Liu’s model in the tourism domain, improving both Accuracy and Recall for the tasks of subjective and sentiment classification. Particularly, the approach is very effective in determining the sentiment orientation of opinions, achieving an F-measure of 92% for the task. However, on average, the algorithms were only capable of extracting 35% of the explicit aspect expressions, using a non-extended approach for this task. Finally, results also showed the effectiveness of our design when applied to solving the industry’s specific issues in the Lake District, since almost 80% of the users who used our tool considered that it adds valuable information to their business. }
}
@article{Zhao2014627,
  title = {Comparison analysis of data mining models applied to clinical research in Traditional Chinese Medicine },
  journal = {Journal of Traditional Chinese Medicine },
  volume = {34},
  number = {5},
  pages = {627 - 634},
  year = {2014},
  note = {},
  issn = {0254-6272},
  doi = {http://dx.doi.org/10.1016/S0254-6272(15)30074-1},
  url = {http://www.sciencedirect.com/science/article/pii/S0254627215300741},
  author = {Yufeng Zhao and Qi Xie and Liyun He and Baoyan Liu and Kun Li and Xiang Zhang and Wenjing Bai and Lin Luo and Xianghong Jing and Ruili Huo},
  keywords = {Medicine, Chinese traditional, Biomedical research},
  keywords = {Data mining},
  keywords = {Model},
  keywords = {Comparison analysis },
  abstract = {Abstract Objective To help researchers select appropriate data mining models to provide better evidence for the clinical practice of Traditional Chinese Medicine (TCM) diagnosis and therapy. Methods Clinical issues based on data mining models were comprehensively summarized from four significant elements of the clinical studies: symptoms, symptom patterns, herbs, and efficacy. Existing problems were further generalized to determine the factors relevant to the performance of data mining models, e.g. data type, samples, parameters, variable labels. Combining these relevant factors, the \{TCM\} clinical data features were compared with regard to statistical characteristics and informatics properties. Data models were simultaneously compared from the view of applicable conditions and suitable scopes. Results The main application problems were inconsistent data types and small samples for the data mining models used, which caused inappropriate or even mistaken results. These features, i.e. advantages, disadvantages, suitable data types, data mining tasks, and the \{TCM\} issues, were summarized and compared. Conclusion By considering the special features of different data mining models, clinical doctors can select suitable data mining models to resolve \{TCM\} problems. }
}
@article{Tekin2014406,
  title = {A graph mining approach for detecting identical design structures in object-oriented design models },
  journal = {Science of Computer Programming },
  volume = {95, Part 4},
  number = {0},
  pages = {406 - 425},
  year = {2014},
  note = {Special Issue on Software Clones (IWSC'12) },
  issn = {0167-6423},
  doi = {http://dx.doi.org/10.1016/j.scico.2013.09.015},
  url = {http://www.sciencedirect.com/science/article/pii/S0167642313002451},
  author = {Umut Tekin and Feza Buzluca},
  keywords = {Software design models},
  keywords = {Identical design structures},
  keywords = {Software motifs},
  keywords = {Pattern extraction},
  keywords = {Graph mining },
  abstract = {Abstract The object-oriented approach has been the most popular software design methodology for the past twenty-five years. Several design patterns and principles are defined to improve the design quality of object-oriented software systems. In addition, designers can use unique design motifs that are designed for the specific application domains. Another commonly used technique is cloning and modifying some parts of the software while creating new modules. Therefore, object-oriented programs can include many identical design structures. This work proposes a sub-graph mining-based approach for detecting identical design structures in object-oriented systems. By identifying and analyzing these structures, we can obtain useful information about the design, such as commonly-used design patterns, most frequent design defects, domain-specific patterns, and reused design clones, which could help developers to improve their knowledge about the software architecture. Furthermore, problematic parts of frequent identical design structures are appropriate refactoring opportunities because they affect multiple areas of the architecture. Experiments with several open-source and industrial projects show that we can successfully find many identical design structures within a project (intra-project) and between different projects (inter-project). We observe that usually most of the detected identical structures are an implementation of common design patterns; however, we also detect various anti-patterns, domain-specific patterns, reused design parts and design-level clones. }
}
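Detecting a repeated design structure reduces to subgraph matching once classes and dependencies are modeled as a directed graph; networkx's VF2 matcher can find every occurrence of a fixed motif. Full identical-structure mining would enumerate frequent subgraphs (e.g., with gSpan) rather than match one motif, so this is only the matching core:

import networkx as nx
from networkx.algorithms import isomorphism

# Toy class-dependency graph with two copies of the same structure
design = nx.DiGraph([("A", "B"), ("A", "C"), ("D", "E"), ("D", "F")])
motif = nx.DiGraph([("x", "y"), ("x", "z")])   # one class depending on two others

matcher = isomorphism.DiGraphMatcher(design, motif)
for mapping in matcher.subgraph_isomorphisms_iter():
    print(mapping)   # the A-B-C and D-E-F copies, each in two orderings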
@article{Sedrakyan2014486,
  title = {Process mining analysis of conceptual modeling behavior of novices – empirical study using \{JMermaid\} modeling and experimental logging environment },
  journal = {Computers in Human Behavior },
  volume = {41},
  number = {0},
  pages = {486 - 503},
  year = {2014},
  note = {},
  issn = {0747-5632},
  doi = {http://dx.doi.org/10.1016/j.chb.2014.09.054},
  url = {http://www.sciencedirect.com/science/article/pii/S0747563214005123},
  author = {Gayane Sedrakyan and Monique Snoeck and Jochen De Weerdt},
  keywords = {Teaching/learning conceptual modeling},
  keywords = {Process-oriented feedback},
  keywords = {Conceptual modeling pattern},
  keywords = {Information systems education},
  keywords = {Process mining},
  keywords = {Learning data analytics },
  abstract = {Abstract Previous studies on learning challenges in the field of modeling focus on cognitive perspectives, such as model understanding, modeling language knowledge and perceptual properties of graphical notation by novice business analysts as major sources affecting model quality. In the educational context outcome feedback is usually applied to improve learning achievements. However, not many research publications have been written observing the characteristics of a modeling process itself that can be associated with better/worse learning outcomes, nor have any empirically validated results been reported on the observations of modeling activities in the educational context. This paper attempts to cover this gap for conceptual modeling. We analyze modeling behavior (conceptual modeling event data of 20 cases, 10,000 events in total) using the experimental logging functionality of the \{JMermaid\} modeling tool and process mining techniques. The outcomes of the work include modeling patterns that are indicative of worse/better learning performance. The results contribute to (1) improving teaching guidance for conceptual modeling targeted at process-oriented feedback, (2) providing recommendations on the type of data that can be useful in observing a modeling behavior from the perspective of learning outcomes. In addition, the study provides first insights for learning analytics research in the domain of conceptual modeling. }
}
@article{PeñalverMartinez20145995,
  title = {Feature-based opinion mining through ontologies },
  journal = {Expert Systems with Applications },
  volume = {41},
  number = {13},
  pages = {5995 - 6008},
  year = {2014},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2014.03.022},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417414001511},
  author = {Isidro Peñalver-Martinez and Francisco Garcia-Sanchez and Rafael Valencia-Garcia and Miguel Ángel Rodríguez-García and Valentín Moreno and Anabel Fraga and Jose Luis Sánchez-Cervantes},
  keywords = {Opinion mining},
  keywords = {Ontology},
  keywords = {Sentiment analysis},
  keywords = {Feature extraction},
  keywords = {Part of speech tagging},
  keywords = {Polarity identification },
  abstract = {Abstract The idiosyncrasy of the Web has, in the last few years, been altered by Web 2.0 technologies and applications and the advent of the so-called Social Web. While users were merely information consumers in the traditional Web, they play a much more active role in the Social Web since they are now also data providers. The mass involvement in the process of creating Web content has led many public and private organizations to focus their attention on analyzing this content in order to ascertain the general public’s opinions as regards a number of topics. Given the current Web size and growth rate, automated techniques are essential if practical and scalable solutions are to be obtained. Opinion mining is a highly active research field that comprises natural language processing, computational linguistics and text analysis techniques with the aim of extracting various kinds of added-value and informational elements from users’ opinions. However, current opinion mining approaches are hampered by a number of drawbacks such as the absence of semantic relations between concepts in feature search processes or the lack of advanced mathematical methods in sentiment analysis processes. In this paper we propose an innovative opinion mining methodology that takes advantage of new Semantic Web-guided solutions to enhance the results obtained with traditional natural language processing techniques and sentiment analysis processes. The main goals of the proposed methodology are: (1) to improve feature-based opinion mining by using ontologies at the feature selection stage, and (2) to provide a new vector analysis-based method for sentiment analysis. The methodology has been implemented and thoroughly tested in a real-world movie review-themed scenario, yielding very promising results when compared with other conventional approaches. }
}
@incollection{Talabis2015123,
  title = {Chapter 6 - Security and Text Mining },
  editor = {Mark Ryan M. Talabis and Robert McPherson and I. Miyamoto and Jason L. Martin and D. Kaye},
  booktitle = {Information Security Analytics },
  publisher = {Syngress},
  edition = {},
  address = {Boston},
  year = {2015},
  pages = {123 - 150},
  isbn = {978-0-12-800207-0},
  doi = {http://dx.doi.org/10.1016/B978-0-12-800207-0.00006-X},
  url = {http://www.sciencedirect.com/science/article/pii/B978012800207000006X},
  author = {Mark Ryan M. Talabis and Robert McPherson and I. Miyamoto and Jason L. Martin and D. Kaye},
  keywords = {Big data},
  keywords = {CRAN},
  keywords = {Hadoop},
  keywords = {Hive},
  keywords = {R},
  keywords = {Stop word list},
  keywords = {Text mining},
  keywords = {Token},
  keywords = {Unstructured text },
  abstract = {Abstract Massive amounts of unstructured data are being collected from online sources, such as e-mails, call center transcripts, wikis, online bulletin boards, blogs, tweets, Web pages, and so on. The R programming language contains a rich collection of packages and functions for analyzing unstructured text data. Functions include those for identifying unique words and their corresponding occurrence frequencies, a process known as tokenizing. Other functions provide a means for cleansing text data, such as removal of white space and punctuation, converting to lowercase, and removing less meaningful words through a stop word list. Apache Hive functions also provide the means for tokenizing large amounts of text using the Hadoop MapReduce framework. Text data that have been reduced to a more manageable size using Hive functions can be further analyzed using the R language's vast array of text processing and advanced analytical functions. }
}
@article{Majid201566,
  title = {A system for mining interesting tourist locations and travel sequences from public geo-tagged photos },
  journal = {Data \& Knowledge Engineering },
  volume = {95},
  number = {0},
  pages = {66 - 86},
  year = {2015},
  note = {},
  issn = {0169-023X},
  doi = {http://dx.doi.org/10.1016/j.datak.2014.11.001},
  url = {http://www.sciencedirect.com/science/article/pii/S0169023X14000962},
  author = {Abdul Majid and Ling Chen and Hamid Turab Mirza and Ibrar Hussain and Gencai Chen},
  keywords = {Geo-referenced photographs},
  keywords = {Trip planning},
  keywords = {Context-aware query},
  keywords = {Travel sequence},
  keywords = {Spatio-temporal data mining },
  abstract = {Abstract Geo-tagged photos of users on social media sites (e.g., Flickr) provide plentiful location-based data. These data provide a wealth of information about user behaviours, and their potential is increasing as it becomes ever more common for images to be associated with location information in the form of geo-tags. Recently, there has been an increasing tendency to adopt the information from these geo-tagged photos for learning to recommend tourist locations. In this paper, we propose a system to recommend interesting tourist locations and interesting tourist travel sequences (i.e., sequences of tourist locations) from a collection of geo-tagged photos. The proposed system is capable of understanding context (i.e., time, date, and weather), as well as taking into account the collective wisdom of people, to make tourist recommendations. We illustrate our technique on a sample of a public Flickr data set. Experimental results demonstrate that the proposed approach is able to generate better recommendations than other state-of-the-art landmark-based recommendation methods. }
}
@incollection{Trovati2015155,
  title = {Chapter 11 - Mining Social Media: Architecture, Tools, and Approaches to Detecting Criminal Activity },
  editor = {Babak Akhgar and Gregory B. Saathoff and Hamid R. Arabnia and Richard Hill and Andrew Staniforth and Petra Saskia Bayerl},
  booktitle = {Application of Big Data for National Security },
  publisher = {Butterworth-Heinemann},
  edition = {},
  address = {},
  year = {2015},
  pages = {155 - 172},
  isbn = {978-0-12-801967-2},
  doi = {http://dx.doi.org/10.1016/B978-0-12-801967-2.00011-2},
  url = {http://www.sciencedirect.com/science/article/pii/B9780128019672000112},
  author = {Marcello Trovati},
  keywords = {Bayesian networks},
  keywords = {Data mining},
  keywords = {Information retrieval},
  keywords = {Social media},
  keywords = {Text mining },
  abstract = {Abstract Social media have been increasingly used as a means of information gathering and sharing in criminal activities. The ability to expose and analyze any such activity is at the core of crime detection, providing a variety of techniques and platforms that facilitate the decision-making process in this field. }
}
@article{Zihayat2014138,
  title = {Mining top-k high utility patterns over data streams },
  journal = {Information Sciences },
  volume = {285},
  number = {0},
  pages = {138 - 161},
  year = {2014},
  note = {Processing and Mining Complex Data Streams },
  issn = {0020-0255},
  doi = {http://dx.doi.org/10.1016/j.ins.2014.01.045},
  url = {http://www.sciencedirect.com/science/article/pii/S0020025514000814},
  author = {Morteza Zihayat and Aijun An},
  keywords = {High utility pattern mining},
  keywords = {Data stream},
  keywords = {Top-k pattern mining},
  keywords = {Sliding window },
  abstract = {Abstract Online high utility itemset mining over data streams has been studied recently. However, the existing methods are not designed for producing top-k patterns. Since there could be a large number of high utility patterns, finding only top-k patterns is more attractive than producing all the patterns whose utility is above a threshold. A challenge with finding top-k high utility itemsets over data streams is that it is not easy for users to determine a proper minimum utility threshold in order for the method to work efficiently. In this paper, we propose a new method (named T-HUDS) for finding top-k high utility patterns over sliding windows of a data stream. The method is based on a compressed tree structure, called HUDS-tree, that can be used to efficiently find potential top-k high utility itemsets over sliding windows. T-HUDS uses a new utility estimation model to more effectively prune the search space. We also propose several strategies for initializing and dynamically adjusting the minimum utility threshold. We prove that no top-k high utility itemset is missed by the proposed method. Our experimental results on real and synthetic datasets show that our strategies and new utility estimation model work very effectively and that T-HUDS outperforms two state-of-the-art high utility itemset algorithms substantially in terms of execution time and memory storage. }
}
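A minimal illustrative sketch (Python) of the general idea named above: itemset utility is quantity times an external per-item profit, summed over the transactions currently inside a sliding window, with the k best itemsets kept via a heap. This is a brute-force stand-in, not the T-HUDS algorithm or its HUDS-tree, and the profit table and transactions are hypothetical.

from itertools import combinations
import heapq

profit = {"a": 4, "b": 1, "c": 2}           # hypothetical external utilities
window = [                                   # transactions in the current window
    {"a": 2, "b": 3},
    {"b": 1, "c": 5},
    {"a": 1, "b": 2, "c": 1},
]

def utility(itemset, txn):
    # utility of itemset in txn, or 0 if txn does not contain the whole itemset
    if not all(i in txn for i in itemset):
        return 0
    return sum(profit[i] * txn[i] for i in itemset)

def topk_high_utility(window, k):
    items = sorted({i for t in window for i in t})
    scored = []
    for r in range(1, len(items) + 1):
        for itemset in combinations(items, r):
            u = sum(utility(itemset, t) for t in window)
            scored.append((u, itemset))
    return heapq.nlargest(k, scored)         # the k itemsets of highest utility

print(topk_high_utility(window, k=3))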
@article{Guerrero2014376,
  title = {Improving Knowledge-Based Systems with statistical techniques, text mining, and neural networks for non-technical loss detection },
  journal = {Knowledge-Based Systems },
  volume = {71},
  number = {0},
  pages = {376 - 388},
  year = {2014},
  note = {},
  issn = {0950-7051},
  doi = {http://dx.doi.org/10.1016/j.knosys.2014.08.014},
  url = {http://www.sciencedirect.com/science/article/pii/S0950705114003025},
  author = {Juan I. Guerrero and Carlos León and Iñigo Monedero and Félix Biscarri and Jesús Biscarri},
  keywords = {Expert system},
  keywords = {Power distribution},
  keywords = {Non-technical losses},
  keywords = {Neural network},
  keywords = {Text mining },
  abstract = {Abstract Currently, power distribution companies have several problems that are related to energy losses. For example, the energy used might not be billed due to illegal manipulation or a breakdown in the customer’s measurement equipment. These types of losses are called non-technical losses (NTLs), and these losses are usually greater than the losses that are due to the distribution infrastructure (technical losses). Traditionally, a large number of studies have used data mining to detect NTLs, but to the best of our knowledge, there are no studies that involve the use of a Knowledge-Based System (KBS) that is created based on the knowledge and expertise of the inspectors. In the present study, a \{KBS\} was built that is based on the knowledge and expertise of the inspectors and that uses text mining, neural networks, and statistical techniques for the detection of NTLs. Text mining, neural networks, and statistical techniques were used to extract information from samples, and this information was translated into rules, which were joined to the rules that were generated by the knowledge of the inspectors. This system was tested with real samples that were extracted from Endesa databases. Endesa is one of the most important distribution companies in Spain, and it plays an important role in international markets in both Europe and South America, having more than 73 million customers. }
}
@article{Ma201413,
  title = {Mining hidden links in social networks to achieve equilibrium },
  journal = {Theoretical Computer Science },
  volume = {556},
  number = {0},
  pages = {13 - 24},
  year = {2014},
  note = {Combinatorial Optimization and Applications },
  issn = {0304-3975},
  doi = {http://dx.doi.org/10.1016/j.tcs.2014.08.006},
  url = {http://www.sciencedirect.com/science/article/pii/S0304397514006136},
  author = {Huan Ma and Zaixin Lu and Deying Li and Yuqing Zhu and Lidan Fan and Weili Wu},
  keywords = {Mining hidden links},
  keywords = {Vertex pair proximity},
  keywords = {Approximation algorithm},
  keywords = {Nash equilibrium},
  keywords = {Social networks },
  abstract = {Abstract Although more connections between individuals in a social network can be identified with the development of advanced techniques, obtaining the complete relation information between individuals is still hard due to complex structure and individual privacy. Social networks, however, have communities. In our work, we aim at mining the invisible or missing relations between individuals within a community in social networks. We propose our algorithm based on the fact that individuals exist in communities satisfying Nash equilibrium, a concept borrowed from game theory and often used in economic research. Each hidden relation is explored through the individual's loyalty to their community. To the best of our knowledge, this is the first work that studies the problem of mining hidden links from the perspective of Nash equilibrium. Finally, we confirm our approach's superiority through extensive experiments on real-world social networks. }
}
@article{Lee20129623,
  title = {Mining spatio-temporal information on microblogging streams using a density-based online clustering method },
  journal = {Expert Systems with Applications },
  volume = {39},
  number = {10},
  pages = {9623 - 9641},
  year = {2012},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2012.02.136},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417412003995},
  author = {Chung-Hong Lee},
  keywords = {Topic detection},
  keywords = {Text mining},
  keywords = {Microblogging},
  keywords = {Temporal analysis},
  keywords = {Spatial analysis },
  abstract = {Social networks have been regarded as a timely and cost-effective source of spatio-temporal information for many fields of application. However, although some research groups have successfully developed topic detection methods for text streams, and some popular microblogging services such as Twitter do provide information on top trending topics, these still cannot fully support users in picking up all of the real-time event topics with a comprehensive spatio-temporal viewpoint to satisfy their information needs. This paper aims to investigate how microblogging social networks (i.e. Twitter) can be used as a reliable information source of emerging events by extracting their spatio-temporal features from the messages to enhance event awareness. In this work, we applied a density-based online clustering method for mining microblogging text streams, in order to obtain temporal and geospatial features of real-world events. By analyzing the events detected by our system, the temporal and spatial impacts of the emerging events can be estimated, for achieving the goals of situational awareness and risk management. }
}
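A hedged sketch of the online-clustering step: the simplified leader-style clusterer below (a stand-in for the paper's density-based method, which is not reproduced here) assigns each incoming geotagged point to the nearest existing cluster within a distance threshold, otherwise it opens a new cluster. The threshold and coordinates are made up.

import math

def haversine_km(p, q):
    # great-circle distance between two (lat, lon) points in kilometres
    lat1, lon1, lat2, lon2 = map(math.radians, (*p, *q))
    a = (math.sin((lat2 - lat1) / 2) ** 2
         + math.cos(lat1) * math.cos(lat2) * math.sin((lon2 - lon1) / 2) ** 2)
    return 6371.0 * 2 * math.asin(math.sqrt(a))

class OnlineClusterer:
    def __init__(self, eps_km=5.0):
        self.eps = eps_km
        self.clusters = []                   # each cluster: list of (lat, lon)

    def add(self, point):
        best, best_d = None, float("inf")
        for c in self.clusters:
            centroid = (sum(p[0] for p in c) / len(c),
                        sum(p[1] for p in c) / len(c))
            d = haversine_km(point, centroid)
            if d < best_d:
                best, best_d = c, d
        if best is not None and best_d <= self.eps:
            best.append(point)               # absorb into the nearest cluster
        else:
            self.clusters.append([point])    # start a new cluster

oc = OnlineClusterer()
for pt in [(31.23, 121.47), (31.24, 121.48), (39.90, 116.40)]:
    oc.add(pt)
print(len(oc.clusters))                      # 2: the two nearby points merge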
@article{Jang2014638,
  title = {A review of soft computing technology applications in several mining problems },
  journal = {Applied Soft Computing },
  volume = {22},
  number = {0},
  pages = {638 - 651},
  year = {2014},
  note = {},
  issn = {1568-4946},
  doi = {http://dx.doi.org/10.1016/j.asoc.2014.05.019},
  url = {http://www.sciencedirect.com/science/article/pii/S1568494614002464},
  author = {Hyongdoo Jang and Erkan Topal},
  keywords = {Soft computing},
  keywords = {Mining method selection},
  keywords = {Mining equipment selection},
  keywords = {Rock mechanics},
  keywords = {Blasting },
  abstract = {Abstract Soft computing (SC) is a field of computer science that resembles the processes of the human brain. While conventional hard computing is run based on crisp values and binary numbers, \{SC\} uses soft values and fuzzy sets. In fact, \{SC\} technology is capable of addressing imprecision and uncertainty. The application of \{SC\} techniques in the mining industry is fairly extensive and covers a considerable number of applications. This paper provides a comprehensive overview of the published work on \{SC\} applications in different mining areas. A brief introduction to mining and the general field of \{SC\} applications are presented in the first section of the paper. The second section comprises four review chapters. Mining method selection, equipment selection problems and their applications in \{SC\} technologies are presented in chapters one and two. Chapter three discusses rock mechanics-related subjects and some representative \{SC\} applications in this field. The last chapter presents rock blasting-related \{SC\} applications that include blast design and hazards. The final section of the paper comments on the use of \{SC\} applications in several mining problems and possible future applications of advanced \{SC\} technologies. }
}
@article{Tang20147805,
  title = {Mining language variation using word using and collocation characteristics },
  journal = {Expert Systems with Applications },
  volume = {41},
  number = {17},
  pages = {7805 - 7819},
  year = {2014},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2014.05.018},
  url = {http://www.sciencedirect.com/science/article/pii/S095741741400298X},
  author = {Peng Tang and Tommy W.S. Chow},
  keywords = {Language variation},
  keywords = {Text mining},
  keywords = {Frequency Rank Ratio},
  keywords = {Overall Intimacy },
  abstract = {Abstract Two textual metrics “Frequency Rank” (FR) and “Intimacy” are proposed in this paper to measure the word using and collocation characteristics, which are two important aspects of text style. The FR, derived from the local index numbers of terms in a sentence ordered by the global frequency of terms, provides single-term-level information. The Intimacy models the relationship between a word and others, i.e. the closeness a term is to other terms in the same sentence. Two textual features “Frequency Rank Ratio (FRR)” and “Overall Intimacy (OI)” for capturing language variation are derived by employing the two proposed textual metrics. Using the derived features, language variation among documents can be visualized in a text space. Three corpora consisting of documents of diverse topics, genres, regions, and dates of writing are designed and collected to evaluate the proposed algorithms. Extensive simulations are conducted to verify the feasibility and performance of our implementation. Both theoretical analyses based on entropy and the simulations demonstrate the feasibility of our method. We also show the proposed algorithm can be used for visualizing the closeness of several western languages. Variation of modern English over time is also recognizable when using our analysis method. Finally, our method is compared to conventional text classification implementations. The comparative results indicate our method outperforms the others. }
}
@article{Deng20141763,
  title = {Fast mining Top-Rank-k frequent patterns by using Node-lists },
  journal = {Expert Systems with Applications },
  volume = {41},
  number = {4, Part 2},
  pages = {1763 - 1768},
  year = {2014},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2013.08.075},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417413006969},
  author = {Zhi-Hong Deng},
  keywords = {Data mining},
  keywords = {Pattern mining},
  keywords = {Top-Rank-k frequent patterns},
  keywords = {Node-list},
  keywords = {Algorithm },
  abstract = {Abstract Mining Top-Rank-k frequent patterns is an emerging topic in frequent pattern mining in recent years. In this paper, we propose a new mining algorithm, NTK, to mine Top-Rank-k frequent patterns. The \{NTK\} algorithm employs a data structure, Node-list, to represent patterns. The Node-list structure makes the mining process much more efficient. We have experimentally evaluated our algorithm against two representative algorithms on four real datasets. The experimental results show that the \{NTK\} algorithm is efficient; it is at least two orders of magnitude faster than the \{FAE\} algorithm and also remarkably faster than the \{VTK\} algorithm, the recently reported state-of-the-art algorithm for mining Top-Rank-k frequent patterns. }
}
@article{Li2014114,
  title = {An ontology-based Web mining method for unemployment rate prediction },
  journal = {Decision Support Systems },
  volume = {66},
  number = {0},
  pages = {114 - 122},
  year = {2014},
  note = {},
  issn = {0167-9236},
  doi = {http://dx.doi.org/10.1016/j.dss.2014.06.007},
  url = {http://www.sciencedirect.com/science/article/pii/S016792361400181X},
  author = {Ziang Li and Wei Xu and Likuan Zhang and Raymond Y.K. Lau},
  keywords = {Unemployment rate prediction},
  keywords = {Search engine query data},
  keywords = {Domain ontology},
  keywords = {Web mining},
  keywords = {Neural networks},
  keywords = {Support vector regressions },
  abstract = {Abstract Unemployment rate is one of the most critical economic indicators. By analyzing and predicting unemployment rate, government officials can develop appropriate labor market related policies in response to the current economic situation. Accordingly, unemployment rate prediction has attracted a lot of attention from researchers in recent years. The main contribution of this paper is the illustration of a novel ontology-based Web mining framework that leverages search engine queries to improve the accuracy of unemployment rate prediction. The proposed framework is underpinned by a domain ontology which captures unemployment related concepts and their semantic relationships to facilitate the extraction of useful prediction features from relevant search engine queries. In addition, state-of-the-art feature selection methods and data mining models such as neural networks and support vector regressions are exploited to enhance the effectiveness of unemployment rate prediction. Our experimental results show that the proposed framework outperforms other baseline forecasting approaches that have been widely used for unemployment rate prediction. Our empirical findings also confirm that domain ontology and search engine queries can be exploited to improve the effectiveness of unemployment rate prediction. A unique advantage of the proposed framework is that it not only improves prediction performance but also provides human comprehensible explanations for the changes of unemployment rate. The business implication of our research work is that government officials and human resources managers can utilize the proposed framework to effectively analyze unemployment rate, and hence to better develop labor market related policies. }
}
@article{Khan2014258,
  title = {Mining opinion components from unstructured reviews: A review },
  journal = {Journal of King Saud University - Computer and Information Sciences },
  volume = {26},
  number = {3},
  pages = {258 - 275},
  year = {2014},
  note = {},
  issn = {1319-1578},
  doi = {http://dx.doi.org/10.1016/j.jksuci.2014.03.009},
  url = {http://www.sciencedirect.com/science/article/pii/S131915781400010X},
  author = {Khairullah Khan and Baharum Baharudin and Aurnagzeb Khan and Ashraf Ullah},
  keywords = {Opinion mining},
  keywords = {Sentiment analysis},
  keywords = {Information retrieval},
  keywords = {Text mining},
  keywords = {Web mining },
  abstract = {Abstract Opinion mining is an interesting area of research because of its applications in various fields. Collecting opinions of people about products and about social and political events and problems through the Web is becoming increasingly popular every day. The opinions of users are helpful for the public and for stakeholders when making certain decisions. Opinion mining is a way to retrieve information through search engines, Web blogs and social networks. Because of the huge number of reviews in the form of unstructured text, it is impossible to summarize the information manually. Accordingly, efficient computational methods are needed for mining and summarizing the reviews from corpuses and Web documents. This study presents a systematic literature survey regarding the computational techniques, models and algorithms for mining opinion components from unstructured reviews. }
}
@article{Jeong2014776,
  title = {Time gap analysis by the topic model-based temporal technique },
  journal = {Journal of Informetrics },
  volume = {8},
  number = {3},
  pages = {776 - 790},
  year = {2014},
  note = {},
  issn = {1751-1577},
  doi = {http://dx.doi.org/10.1016/j.joi.2014.07.005},
  url = {http://www.sciencedirect.com/science/article/pii/S1751157714000650},
  author = {Do-Heon Jeong and Min Song},
  keywords = {Text mining},
  keywords = {Topic modeling},
  keywords = {Latent Dirichlet Allocation (LDA)},
  keywords = {Content analysis},
  keywords = {Temporal analysis},
  keywords = {Multiple resources },
  abstract = {Abstract This study proposes a temporal analysis method to utilize heterogeneous resources such as papers, patents, and web news articles in an integrated manner. We analyzed the time gap phenomena between three resources and two academic areas by conducting text mining-based content analysis. To this end, a topic modeling technique, Latent Dirichlet Allocation (LDA), was used to estimate the optimal time gaps among three resources (papers, patents, and web news articles) in two research domains. The contributions of this study are summarized as follows: firstly, we propose a new temporal analysis method to understand the content characteristics and trends of heterogeneous multiple resources in an integrated manner. We applied it to measure the exact time intervals between academic areas by understanding the time gap phenomena. The results of the temporal analysis showed that the resources of the medical field were more up to date than those of the computer field, and were thus disclosed to the public more promptly. Secondly, we adopted a power-law exponent measurement and content analysis to evaluate the proposed method. With the proposed method, we demonstrate how to analyze heterogeneous resources more precisely and comprehensively. }
}
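A small sketch of the LDA step, assuming the gensim library: fit one topic model over the pooled documents, then compare each resource's average topic distribution. The toy corpora are placeholders, and the comparison is a simplification of the paper's time-gap estimation.

from gensim import corpora, models

papers  = [["topic", "model", "lda"], ["gene", "expression", "model"]]
patents = [["lda", "patent", "device"], ["gene", "sequence", "device"]]

texts = papers + patents
dictionary = corpora.Dictionary(texts)
bow = [dictionary.doc2bow(t) for t in texts]
lda = models.LdaModel(bow, id2word=dictionary, num_topics=2, passes=10,
                      random_state=0)

def mean_topic_dist(docs):
    # average topic weights over a set of documents
    total = [0.0] * lda.num_topics
    for d in docs:
        for t, w in lda.get_document_topics(dictionary.doc2bow(d),
                                            minimum_probability=0.0):
            total[t] += w
    return [x / len(docs) for x in total]

print("papers :", mean_topic_dist(papers))
print("patents:", mean_topic_dist(patents))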
@article{Lara2014219,
  title = {A general framework for time series data mining based on event analysis: Application to the medical domains of electroencephalography and stabilometry },
  journal = {Journal of Biomedical Informatics },
  volume = {51},
  number = {0},
  pages = {219 - 241},
  year = {2014},
  note = {},
  issn = {1532-0464},
  doi = {http://dx.doi.org/10.1016/j.jbi.2014.06.003},
  url = {http://www.sciencedirect.com/science/article/pii/S1532046414001415},
  author = {Juan A. Lara and David Lizcano and Aurora Pérez and Juan P. Valente},
  keywords = {Medical data mining},
  keywords = {Time series analysis},
  keywords = {Event},
  keywords = {Classification},
  keywords = {Electroencephalography},
  keywords = {Stabilometry },
  abstract = {Abstract There are now domains where information is recorded over a period of time, leading to sequences of data known as time series. In many domains, like medicine, time series analysis requires focusing on certain regions of interest, known as events, rather than analyzing the whole time series. In this paper, we propose a framework for knowledge discovery in both one-dimensional and multidimensional time series containing events. We show how our approach can be used to classify medical time series by means of a process that identifies events in time series, generates time series reference models of representative events and compares two time series by analyzing the events they have in common. We have applied our framework on time series generated in the areas of electroencephalography (EEG) and stabilometry. Framework performance was evaluated in terms of classification accuracy, and the results confirmed that the proposed schema has potential for classifying \{EEG\} and stabilometric signals. The proposed framework is useful for discovering knowledge from medical time series containing events, such as stabilometric and electroencephalographic time series. These results would be equally applicable to other medical domains generating iconographic time series, such as, for example, electrocardiography (ECG). }
}
@article{Nguyen20144716,
  title = {Efficient strategies for parallel mining class association rules },
  journal = {Expert Systems with Applications },
  volume = {41},
  number = {10},
  pages = {4716 - 4729},
  year = {2014},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2014.01.038},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417414000621},
  author = {Dang Nguyen and Bay Vo and Bac Le},
  keywords = {Associative classification},
  keywords = {Class association rule mining},
  keywords = {Parallel computing},
  keywords = {Data mining},
  keywords = {Multi-core processor },
  abstract = {Abstract Mining class association rules (CARs) is an essential, but time-intensive task in Associative Classification (AC). A number of algorithms have been proposed to speed up the mining process. However, sequential algorithms are not efficient for mining \{CARs\} in large datasets, while existing parallel algorithms require communication and collaboration among computing nodes, which introduces a high cost of synchronization. This paper addresses these drawbacks by proposing three efficient approaches for mining \{CARs\} in large datasets relying on parallel computing. To date, this is the first study that tries to implement an algorithm for parallel mining of \{CARs\} on a computer with a multi-core processor architecture. The proposed parallel algorithm is theoretically proven to be faster than existing parallel algorithms. The experimental results also show that our proposed parallel algorithm outperforms a recent sequential algorithm in mining time. }
}
@article{Anwar2014123,
  title = {Namesake alias mining on the Web and its role towards suspect tracking },
  journal = {Information Sciences },
  volume = {276},
  number = {0},
  pages = {123 - 145},
  year = {2014},
  note = {},
  issn = {0020-0255},
  doi = {http://dx.doi.org/10.1016/j.ins.2014.02.050},
  url = {http://www.sciencedirect.com/science/article/pii/S002002551400156X},
  author = {Tarique Anwar and Muhammad Abulaish},
  keywords = {Web content mining},
  keywords = {Web people search},
  keywords = {Alias mining},
  keywords = {Namesake disambiguation},
  keywords = {Clustering },
  abstract = {Abstract With the proliferation of social media, the number of active web-users is rapidly increasing these days. They create and maintain their personal web-profiles, and use them to interact with others in the cyber-space. Currently two major problems are being faced to automatically identify these web-users and correlate their web-profiles. First is the presence of namesakes on the Web, and the second is the use of alias names. In this paper, we propose a context-based text mining approach to discover alias names for all the namesakes sharing a common name on the Web, and leave the task of selecting the namesake of interest to the user. The proposed method employs a search-engine \{API\} to retrieve relevant webpages for a given name. The retrieved webpages are modeled into a graph, and a clustering algorithm is applied to disambiguate the webpages. Thereafter each obtained cluster standing for a namesake is mined for alias identification following a text pattern based statistical technique. Existing research works do not consider the presence of namesakes on the Web when mining aliases, which is impractical; part of the novelty of the proposed approach lies in identifying and addressing this drawback. Additionally, the contribution includes the disambiguation technique that does not need a pre-determined number of clusters to be generated, and the light-weight text pattern based alias mining technique. The number of clusters in the proposed method is instead determined dynamically by the inflation parameter, the pre-determination of which is comparatively much easier. Experimental results on different components demonstrate the robustness of the proposed alias mining approach. This paper also brings forth the significance of alias mining to the problem of suspect monitoring and tracking on the Web. }
}
@article{Deng20144505,
  title = {Fast mining frequent itemsets using Nodesets },
  journal = {Expert Systems with Applications },
  volume = {41},
  number = {10},
  pages = {4505 - 4512},
  year = {2014},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2014.01.025},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417414000463},
  author = {Zhi-Hong Deng and Sheng-Long Lv},
  keywords = {Data mining},
  keywords = {Frequent itemset mining},
  keywords = {Nodesets},
  keywords = {Algorithm},
  keywords = {Performance },
  abstract = {Abstract Node-list and N-list, two novel data structures proposed in recent years, have been proven to be very efficient for mining frequent itemsets. The main problem of these structures is that they both need to encode each node of a PPC-tree with pre-order and post-order codes. This makes them memory-consuming and inconvenient for mining frequent itemsets. In this paper, we propose Nodeset, a more efficient data structure, for mining frequent itemsets. Nodesets require only the pre-order (or post-order) code of each node, which saves about half the memory compared with N-lists and Node-lists. Based on Nodesets, we present an efficient algorithm called \{FIN\} to mine frequent itemsets. For evaluating the performance of FIN, we have conducted experiments to compare it with PrePost and FP-growth∗, two state-of-the-art algorithms, on a variety of real and synthetic datasets. The experimental results show that \{FIN\} performs well in terms of both running time and memory usage. }
}
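A sketch of the encoding idea behind Node-lists and Nodesets: a depth-first traversal assigns each node of a prefix tree its pre-order rank, so an itemset can be represented by the codes of the nodes that register its items. The toy tree is hypothetical and omits the counts a real PPC-tree would carry.

class Node:
    def __init__(self, item):
        self.item, self.children, self.pre = item, [], None

def assign_preorder(root):
    # iterative DFS that stamps each node with its pre-order code
    code, stack = 0, [root]
    while stack:
        node = stack.pop()
        node.pre = code
        code += 1
        stack.extend(reversed(node.children))   # preserve left-to-right order
    return root

# tiny prefix tree: root -> a -> b, and root -> b
root = Node(None)
a, b1, b2 = Node("a"), Node("b"), Node("b")
root.children, a.children = [a, b2], [b1]
assign_preorder(root)
print([(n.item, n.pre) for n in (root, a, b1, b2)])   # root=0, a=1, b1=2, b2=3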
@article{Lin201488,
  title = {Maintenance of prelarge trees for data mining with modified records },
  journal = {Information Sciences },
  volume = {278},
  number = {0},
  pages = {88 - 103},
  year = {2014},
  note = {},
  issn = {0020-0255},
  doi = {http://dx.doi.org/10.1016/j.ins.2014.03.023},
  url = {http://www.sciencedirect.com/science/article/pii/S0020025514003077},
  author = {Chun-Wei Lin and Tzung-Pei Hong},
  keywords = {Data mining},
  keywords = {FP-tree},
  keywords = {Prelarge tree},
  keywords = {Pre-large itemset},
  keywords = {Record modification},
  keywords = {Maintenance },
  abstract = {Abstract The frequent pattern tree (FP-tree) is an efficient data structure for association-rule mining without generation of candidate itemsets. It is used to compress a database into a tree structure which stores only large items. However, when data are modified, it needs to process all transactions in a batch way. In the past, the prelarge-tree structure was proposed to incrementally mine association rules efficiently. In this paper, we propose an algorithm to maintain this structure when records in an original database are modified. The proposed maintenance algorithm is based on the pre-large concepts, which are defined by a lower support threshold and an upper support threshold. Due to the pruning properties of pre-large concepts, the proposed approach can reduce the number of rescans of an original database when records are modified. It can thus obtain good execution performance for pre-large tree maintenance, especially when only a small number of records are modified each time. Although experimental results show that the proposed prelarge-tree maintenance algorithm has good performance for handling modified records, the proposed algorithm needs to maintain nodes of pre-large items in the tree structure. This is an additional overhead, which is a trade-off between execution time and tree complexity. }
}
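A brief sketch of the pre-large concept this entry relies on: with a lower support threshold and an upper support threshold, itemsets fall into large, pre-large, or small classes, and the pre-large band acts as a buffer so that small modifications rarely force a database rescan. The threshold values here are illustrative only.

def classify(count, total_txns, lower=0.3, upper=0.5):
    # classify an itemset by its support ratio against the two thresholds
    ratio = count / total_txns
    if ratio >= upper:
        return "large"
    if ratio >= lower:
        return "pre-large"
    return "small"

print(classify(count=4, total_txns=10))   # 0.4 -> 'pre-large'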
@article{Vimieiro2014171,
  title = {Disclosed: An efficient depth-first, top-down algorithm for mining disjunctive closed itemsets in high-dimensional data },
  journal = {Information Sciences },
  volume = {280},
  number = {0},
  pages = {171 - 187},
  year = {2014},
  note = {},
  issn = {0020-0255},
  doi = {http://dx.doi.org/10.1016/j.ins.2014.04.044},
  url = {http://www.sciencedirect.com/science/article/pii/S002002551400509X},
  author = {Renato Vimieiro and Pablo Moscato},
  keywords = {Disjunctive itemset},
  keywords = {Frequent itemset mining},
  keywords = {Closed itemset},
  keywords = {Rare itemset},
  keywords = {Formal concept analysis},
  keywords = {Microarray data },
  abstract = {Abstract We focus, in this paper, on the computational challenges of identifying disjunctive Boolean patterns in high-dimensional data. We conduct our analysis focusing particularly on microarray gene expression data, since this is one of the most stereotypical examples of high-dimensional data. We devised a novel algorithm that takes advantage of the scarcity of samples in microarray data sets, allowing us to efficiently find disjunctive closed patterns. Our algorithm, Disclosed, mines disjunctive closed itemsets by exploring the search space in a depth-first, top-down manner. We evaluated the performance of our algorithm on real microarray gene expression data sets publicly available on the Internet. Our experiments revealed under which characteristics of a data set our method obtains good, bad or average performance. We also compared the performance of our method with the state-of-the-art algorithms for finding disjunctive closed patterns and disjunctive minimal generators. We observed that our approach is two orders of magnitude more efficient, both in terms of time and memory. }
}
@article{Zheng201429,
  title = {Incorporating appraisal expression patterns into topic modeling for aspect and sentiment word identification },
  journal = {Knowledge-Based Systems },
  volume = {61},
  number = {0},
  pages = {29 - 47},
  year = {2014},
  note = {},
  issn = {0950-7051},
  doi = {http://dx.doi.org/10.1016/j.knosys.2014.02.003},
  url = {http://www.sciencedirect.com/science/article/pii/S0950705114000434},
  author = {Xiaolin Zheng and Zhen Lin and Xiaowei Wang and Kwei-Jay Lin and Meina Song},
  keywords = {Opinion mining},
  keywords = {Appraisal expression pattern},
  keywords = {Dependency analysis},
  keywords = {Topic modeling},
  keywords = {Aspect and sentiment analysis },
  abstract = {Abstract With the considerable growth of user-generated content, online reviews are becoming extremely valuable sources for mining customers’ opinions on products and services. However, most of the traditional opinion mining methods are coarse-grained and cannot understand natural languages. Thus, aspect-based opinion mining and summarization are of great interest in academic and industrial research. In this paper, we study an approach to extract product and service aspect words, as well as sentiment words, automatically from reviews. An unsupervised dependency analysis-based approach is presented to extract Appraisal Expression Patterns (AEPs) from reviews, which represent the manner in which people express opinions regarding products or services and can be regarded as a condensed representation of the syntactic relationship between aspect and sentiment words. \{AEPs\} are high-level, domain-independent types of information, and have excellent domain adaptability. An AEP-based Latent Dirichlet Allocation (AEP-LDA) model is also proposed. This is a sentence-level, probabilistic generative model which assumes that all words in a sentence are drawn from one topic – a generally true assumption, based on our observation. The model also assumes that every review corpus is composed of several mutually corresponding aspect and sentiment topics, as well as a background word topic. The \{AEP\} information is incorporated into the AEP-LDA model for mining aspect and sentiment words simultaneously. The experimental results on reviews of restaurants, hotels, \{MP3\} players, and cameras show that the AEP-LDA model outperforms other approaches in identifying aspect and sentiment words. }
}
@article{Spasić2014605,
  title = {Text mining of cancer-related information: Review of current status and future directions },
  journal = {International Journal of Medical Informatics },
  volume = {83},
  number = {9},
  pages = {605 - 623},
  year = {2014},
  note = {},
  issn = {1386-5056},
  doi = {http://dx.doi.org/10.1016/j.ijmedinf.2014.06.009},
  url = {http://www.sciencedirect.com/science/article/pii/S1386505614001105},
  author = {Irena Spasić and Jacqueline Livsey and John A. Keane and Goran Nenadić},
  keywords = {Cancer},
  keywords = {Natural language processing},
  keywords = {Data mining},
  keywords = {Electronic medical records },
  abstract = {AbstractPurpose This paper reviews the research literature on text mining (TM) with the aim to find out (1) which cancer domains have been the subject of \{TM\} efforts, (2) which knowledge resources can support \{TM\} of cancer-related information and (3) to what extent systems that rely on knowledge and computational methods can convert text data into useful clinical information. These questions were used to determine the current state of the art in this particular strand of \{TM\} and suggest future directions in \{TM\} development to support cancer research. Methods A review of the research on \{TM\} of cancer-related information was carried out. A literature search was conducted on the Medline database as well as \{IEEE\} Xplore and \{ACM\} digital libraries to address the interdisciplinary nature of such research. The search results were supplemented with the literature identified through Google Scholar. Results A range of studies have proven the feasibility of \{TM\} for extracting structured information from clinical narratives such as those found in pathology or radiology reports. In this article, we provide a critical overview of the current state of the art for \{TM\} related to cancer. The review highlighted a strong bias towards symbolic methods, e.g. named entity recognition (NER) based on dictionary lookup and information extraction (IE) relying on pattern matching. The F-measure of \{NER\} ranges between 80% and 90%, while that of \{IE\} for simple tasks is in the high 90s. To further improve the performance, \{TM\} approaches need to deal effectively with idiosyncrasies of the clinical sublanguage such as non-standard abbreviations as well as a high degree of spelling and grammatical errors. This requires a shift from rule-based methods to machine learning following the success of similar trends in biological applications of TM. Machine learning approaches require large training datasets, but clinical narratives are not readily available for \{TM\} research due to privacy and confidentiality concerns. This issue remains the main bottleneck for progress in this area. In addition, there is a need for a comprehensive cancer ontology that would enable semantic representation of textual information found in narrative reports. }
}
@article{Ni2014511,
  title = {Data mining-based study on sub-mentally healthy state among residents in eight provinces and cities in China },
  journal = {Journal of Traditional Chinese Medicine },
  volume = {34},
  number = {4},
  pages = {511 - 517},
  year = {2014},
  note = {},
  issn = {0254-6272},
  doi = {http://dx.doi.org/10.1016/S0254-6272(15)30055-8},
  url = {http://www.sciencedirect.com/science/article/pii/S0254627215300558},
  author = {Hongmei Ni and Xuming Yang and Chengquan Fang and Yingying Guo and Mingyue Xu and Yumin He},
  keywords = {Questionnaires},
  keywords = {Mental health},
  keywords = {Data mining},
  keywords = {Strategic tree},
  keywords = {Artificial neural network },
  abstract = {AbstractObjective To apply data mining methods to research on the state of sub-mental health among residents in eight provinces and cities in China and to mine latent knowledge about many conditions through data mining and analysis of data on 3970 sub-mentally healthy individuals selected from 13,385 relevant questionnaires. Methods The strategic tree algorithm was used to identify the main manifestations of the state of sub-mental health. The backpropagation artificial neural network was used to analyze the main manifestations of sub-healthy mental states of three different degrees. A sub-mental health evaluation model was then established to achieve predictive evaluation results. Results Using classifications from the Scale of Chinese Sub-healthy State, the main manifestations of sub-mental health selected using the strategic tree were \{F1101\} (Do you lack peace of mind?), \{F1102\} (Are you easily nervous when something comes up?), and \{F1002\} (Do you often sigh?). The relative intensity of manifestations of sub-mental health was highest for F1101, followed by F1102, and then F1002. Through study of the neural network, better differentiation could be made between moderate and severe and between mild and severe states of sub-mental health. The differentiation between mild and moderate sub-mental health states was less apparent. Additionally, the sub-mental health state evaluation model, which could be used to predict states of sub-mental health of different individuals, was established using F1101, F1102, F1002, and the mental self-assessment total score. Conclusions The main manifestations of the state of sub-mental health can be discovered using data mining methods to research and analyze the latent laws and knowledge hidden in research evidence on the state of sub-mental health. The state of sub-mental health of different individuals can be rapidly predicted using the model established here. This can provide a basis for assessment and intervention for sub-mental health. It can also replace the relatively outdated approaches to research on sub-health in the technical era of information and digitization by combining the study of states of sub-mental health with information techniques and by further quantifying the relevant information. }
}
@article{Seret20144648,
  title = {A dynamic understanding of customer behavior processes based on clustering and sequence mining },
  journal = {Expert Systems with Applications },
  volume = {41},
  number = {10},
  pages = {4648 - 4657},
  year = {2014},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2014.01.022},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417414000438},
  author = {Alex Seret and Seppe K.L.M. vanden Broucke and Bart Baesens and Jan Vanthienen},
  keywords = {Clustering},
  keywords = {Sequence mining},
  keywords = {Business knowledge},
  keywords = {Behavior process},
  keywords = {Trajectories},
  keywords = {Direct marketing },
  abstract = {Abstract In this paper, a novel approach towards enabling the exploratory understanding of the dynamics inherent in the capture of customers’ data at different points in time is outlined. The proposed methodology combines state-of-the-art data mining clustering techniques with a tuned sequence mining method to discover prominent customer behavior trajectories in databases, which — when combined — represent the “behavior process” as it is followed by particular groups of customers. The framework is applied to a real-life case of an event organizer; it is shown how behavior trajectories can help to explain consumer decisions and to improve business processes that are influenced by customer actions. }
}
@article{Lan20143450,
  title = {On-shelf utility mining with negative item values },
  journal = {Expert Systems with Applications },
  volume = {41},
  number = {7},
  pages = {3450 - 3459},
  year = {2014},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2013.10.049},
  url = {http://www.sciencedirect.com/science/article/pii/S095741741300866X},
  author = {Guo-Cheng Lan and Tzung-Pei Hong and Jen-Peng Huang and Vincent S. Tseng},
  keywords = {Data mining},
  keywords = {Utility mining},
  keywords = {On-shelf utility mining},
  keywords = {High on-shelf utility itemset},
  keywords = {Negative profit },
  abstract = {Abstract On-shelf utility mining has recently received interest in the data mining field due to its practical considerations. On-shelf utility mining considers not only profits and quantities of items in transactions but also their on-shelf time periods in stores. Profit values of items in traditional on-shelf utility mining are considered as being positive. However, in real-world applications, items may be associated with negative profit values. This paper proposes an efficient three-scan mining approach to efficiently find high on-shelf utility itemsets with negative profit values from temporal databases. In particular, an effective itemset generation method is developed to avoid generating a large number of redundant candidates and to effectively reduce the number of data scans in mining. Experimental results for several synthetic and real datasets show that the proposed approach has good performance in pruning effectiveness and execution efficiency. }
}
@article{Zheng2014757,
  title = {Study on the Method of Road Transport Management Information Data Mining based on Pruning Eclat Algorithm and MapReduce },
  journal = {Procedia - Social and Behavioral Sciences },
  volume = {138},
  number = {0},
  pages = {757 - 766},
  year = {2014},
  note = {The 9th International Conference on Traffic and Transportation Studies (ICTTS 2014) },
  issn = {1877-0428},
  doi = {http://dx.doi.org/10.1016/j.sbspro.2014.07.254},
  url = {http://www.sciencedirect.com/science/article/pii/S1877042814041743},
  author = {Xiaofeng Zheng and Shu Wang},
  keywords = {Road transport},
  keywords = {Association rules},
  keywords = {Eclat},
  keywords = {MapReduce},
  keywords = {Data Mining },
  abstract = {Abstract Road transport management information is a class of massive, correlated data in \{ITS\} (intelligent transportation systems), and mining its association rules has important practical significance. To address the shortcomings of Eclat, the classical optimized association-rule algorithm, this paper proposes and proves two properties under which candidate sets that share an item as a prefix or suffix can be pruned. It then proposes an optimized method for computing frequent sets: a parallel \{NEclat\} method combined with a cloud programming model. This method solves the problem that the Eclat algorithm cannot prune its computation, and achieves parallel computation. Practical application shows that this method can reduce computation time by more than 40%, and that it is suitable for mining association rules in road transport management information. }
}
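For reference, a minimal vertical (Eclat-style) miner: each item carries a tidset, candidate extensions are formed by tidset intersection, and infrequent branches are pruned. This is the textbook Eclat idea, not the paper's pruning-enhanced parallel NEclat.

def eclat(prefix, items, minsup, out):
    # items: list of (item, tidset) pairs still extendable under prefix
    while items:
        item, tids = items.pop()
        if len(tids) >= minsup:
            itemset = prefix + [item]
            out.append((itemset, len(tids)))
            # extend with the remaining items via tidset intersection
            suffix = [(i, tids & t) for i, t in items if len(tids & t) >= minsup]
            eclat(itemset, suffix, minsup, out)
    return out

txns = [{"a", "b"}, {"b", "c"}, {"a", "b", "c"}, {"b"}]
vertical = {}
for tid, t in enumerate(txns):
    for item in t:
        vertical.setdefault(item, set()).add(tid)     # item -> tidset

print(eclat([], sorted(vertical.items()), minsup=2, out=[]))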
@article{Yun20143861,
  title = {High utility itemset mining with techniques for reducing overestimated utilities and pruning candidates },
  journal = {Expert Systems with Applications },
  volume = {41},
  number = {8},
  pages = {3861 - 3878},
  year = {2014},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2013.11.038},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417413009585},
  author = {Unil Yun and Heungmo Ryang and Keun Ho Ryu},
  keywords = {Candidate pruning},
  keywords = {Data mining},
  keywords = {High utility itemsets},
  keywords = {Single-pass tree construction},
  keywords = {Utility mining },
  abstract = {Abstract High utility itemset mining considers the importance of items such as profit and item quantities in transactions. Recently, mining high utility itemsets has emerged as one of the most significant research issues due to a huge range of real world applications such as retail market data analysis and stock market prediction. Although many relevant algorithms have been proposed in recent years, they incur the problem of generating a large number of candidate itemsets, which degrade mining performance. In this paper, we propose an algorithm named MU-Growth (Maximum Utility Growth) with two techniques for pruning candidates effectively in mining process. Moreover, we suggest a tree structure, named MIQ-Tree (Maximum Item Quantity Tree), which captures database information with a single-pass. The proposed data structure is restructured for reducing overestimated utilities. Performance evaluation shows that MU-Growth not only decreases the number of candidates but also outperforms state-of-the-art tree-based algorithms with overestimated methods in terms of runtime with a similar memory usage. }
}
@article{Lai2014267,
  title = {Towards semantically secure outsourcing of association rule mining on categorical data },
  journal = {Information Sciences },
  volume = {267},
  number = {0},
  pages = {267 - 286},
  year = {2014},
  note = {},
  issn = {0020-0255},
  doi = {http://dx.doi.org/10.1016/j.ins.2014.01.040},
  url = {http://www.sciencedirect.com/science/article/pii/S0020025514000760},
  author = {Junzuo Lai and Yingjiu Li and Robert H. Deng and Jian Weng and Chaowen Guan and Qiang Yan},
  keywords = {Association rule mining},
  keywords = {Outsourcing},
  keywords = {Semantic security},
  keywords = {Privacy},
  keywords = {Soundness },
  abstract = {Abstract When outsourcing association rule mining to cloud, it is critical for data owners to protect both sensitive raw data and valuable mining results from being snooped at cloud servers. Previous solutions addressing this concern add random noise to the raw data and/or encrypt the raw data with a substitution mapping. However, these solutions do not provide semantic security; partial information about raw data or mining results can be potentially discovered by an adversary at cloud servers under a reasonable assumption that the adversary knows some plaintext–ciphertext pairs. In this paper, we propose the first semantically secure solution for outsourcing association rule mining with both data privacy and mining privacy. In our solution, we assume that the data is categorical. Additionally, our solution is sound, which enables data owners to verify whether there exists any false data in the mining results returned by a cloud server. Experimental study shows that our solution is feasible and efficient. }
}
@article{Yoon2014287,
  title = {Exploring technological opportunities by linking technology and products: Application of morphology analysis and text mining },
  journal = {Technological Forecasting and Social Change },
  volume = {86},
  number = {0},
  pages = {287 - 303},
  year = {2014},
  note = {},
  issn = {0040-1625},
  doi = {http://dx.doi.org/10.1016/j.techfore.2013.10.013},
  url = {http://www.sciencedirect.com/science/article/pii/S0040162513002710},
  author = {Byungun Yoon and Inchae Park and Byoung-youl Coh},
  keywords = {Technological opportunity discovery},
  keywords = {Morphology analysis},
  keywords = {Text mining },
  abstract = {Abstract Technological opportunity discovery (TOD) can be divided into two types: anticipating new technology and applying existing technology. The latter is useful for small and medium-sized companies, which have weak technology forecasting capability. Thus, this research aims to suggest a methodology for \{TOD\} based on existing technology by using morphology analysis and text mining. The extracted results of \{TOD\} are classified into three categories based on the type of product — existing, applied, and heterogeneous. To illustrate the process and validate the utility of the application, \{LED\} heat dissipation technology and \{LED\} lamps are selected as the technology and product for the illustration. The method contributes a semi-automated normative method for technology forecasting by combining morphology analysis and text mining. }
}
@article{Xu2014468,
  title = {Mining temporal explicit and implicit semantic relations between entities using web search engines },
  journal = {Future Generation Computer Systems },
  volume = {37},
  number = {0},
  pages = {468 - 477},
  year = {2014},
  note = {Special Section: Innovative Methods and Algorithms for Advanced Data-Intensive Computing Special Section: Semantics, Intelligent processing and services for big data Special Section: Advances in Data-Intensive Modelling and Simulation Special Section: Hybrid Intelligence for Growing Internet and its Applications },
  issn = {0167-739X},
  doi = {http://dx.doi.org/10.1016/j.future.2013.09.027},
  url = {http://www.sciencedirect.com/science/article/pii/S0167739X13002069},
  author = {Zheng Xu and Xiangfeng Luo and Shunxiang Zhang and Xiao Wei and Lin Mei and Chuanping Hu},
  keywords = {Temporal semantic relation},
  keywords = {Semantic annotation},
  keywords = {Content analysis},
  keywords = {Web mining },
  abstract = {Abstract In this paper, we study the problem of mining temporal semantic relations between entities. The goal of the studied problem is to mine and annotate a semantic relation with temporal, concise, and structured information, which can reveal the explicit, implicit, and diverse semantic relations between entities. The temporal semantic annotations can help users to learn and understand unfamiliar or newly emerged semantic relations between entities. The proposed temporal semantic annotation structure integrates the features from \{IEEE\} and Renlifang. We propose a general method to generate the temporal semantic annotation of a semantic relation between entities by constructing its connection entities, lexical syntactic patterns, context sentences, context graph, and context communities. Empirical experiments on two different datasets, a LinkedIn dataset and a movie star dataset, show that the proposed method is effective and accurate. Different from manually generated annotation repositories such as Wikipedia and LinkedIn, the proposed method can automatically mine the semantic relation between entities and does not need any prior knowledge such as an ontology or a hierarchical knowledge base. The proposed method can be used in several applications, which demonstrates the effectiveness of the proposed temporal semantic relations on many web mining tasks. }
}
@article{Moonsamy2014122,
  title = {Mining permission patterns for contrasting clean and malicious android applications },
  journal = {Future Generation Computer Systems },
  volume = {36},
  number = {0},
  pages = {122 - 132},
  year = {2014},
  note = {Special Section: Intelligent Big Data Processing Special Section: Behavior Data Security Issues in Network Information Propagation Special Section: Energy-efficiency in Large Distributed Computing Architectures Special Section: eScience Infrastructure and Applications },
  issn = {0167-739X},
  doi = {http://dx.doi.org/10.1016/j.future.2013.09.014},
  url = {http://www.sciencedirect.com/science/article/pii/S0167739X13001933},
  author = {Veelasha Moonsamy and Jia Rong and Shaowu Liu},
  keywords = {Android permission},
  keywords = {Data mining},
  keywords = {Biclustering},
  keywords = {Contrast mining},
  keywords = {Permission pattern },
  abstract = {Abstract An Android application uses a permission system to regulate access to system resources and users’ privacy-relevant information. Existing works have demonstrated several techniques to study the required permissions declared by the developers, but little attention has been paid to used permissions. Besides, no specific permission combination has been identified to be effective for malware detection. To fill these gaps, we have proposed a novel pattern mining algorithm to identify a set of contrast permission patterns that aim to detect the difference between clean and malicious applications. A benchmark malware dataset and a dataset of 1227 clean applications were collected by us to evaluate the performance of the proposed algorithm. Valuable findings are obtained by analyzing the returned contrast permission patterns. }
}
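
The contrast idea in the entry above reduces, in its simplest form, to comparing pattern supports across the two classes. Below is a minimal sketch, not the authors' algorithm: the toy permission sets and the support-difference threshold are invented for illustration.

from itertools import combinations

# Toy data: each app is the set of permissions it uses (hypothetical).
clean = [{"INTERNET"}, {"INTERNET", "READ_CONTACTS"}, {"CAMERA"}]
malicious = [{"INTERNET", "SEND_SMS"}, {"SEND_SMS", "READ_CONTACTS"},
             {"INTERNET", "SEND_SMS", "READ_CONTACTS"}]

def support(pattern, apps):
    # Fraction of apps whose permission set contains the pattern.
    return sum(pattern <= app for app in apps) / len(apps)

perms = set().union(*clean, *malicious)
contrast = []
for k in (1, 2):                                   # patterns of size 1 and 2
    for pattern in map(frozenset, combinations(sorted(perms), k)):
        diff = support(pattern, malicious) - support(pattern, clean)
        if diff >= 0.5:                            # arbitrary contrast threshold
            contrast.append((pattern, diff))

for pattern, diff in sorted(contrast, key=lambda x: -x[1]):
    print(set(pattern), round(diff, 2))            # e.g. {'SEND_SMS'} 1.0
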
@article{Mustapaşa20111381,
  title = {“Hello world”, web mining for e-learning },
  journal = {Procedia Computer Science },
  volume = {3},
  number = {0},
  pages = {1381 - 1387},
  year = {2011},
  note = {World Conference on Information Technology },
  issn = {1877-0509},
  doi = {http://dx.doi.org/10.1016/j.procs.2011.01.019},
  url = {http://www.sciencedirect.com/science/article/pii/S1877050911000202},
  author = {Oğuz Mustapaşa and Adem Karahoca and Dilek Karahoca and Hüseyin Uzunboylu},
  keywords = {Semantic web},
  keywords = {Web mining},
  keywords = {E-learning},
  keywords = {Distance learning},
  keywords = {Personalization },
  abstract = {As the Internet and mobile applications play an increasingly important role in our lives, mobile services have also entered the educational field, in what is usually called “e-learning” or “distance learning”. A known issue in e-learning is that all content is online and there is less face-to-face communication than in traditional learning; this makes it harder to track students’ success and to advise and manage how students study. Hence, data mining, a recent hot topic, can be applied to the data students leave on e-learning portals to guide instructors and advisors in helping students become more successful. Recent research on this topic, generally referred to as semantic web mining, has shown that e-learning combined with data mining can decrease the gap between e-learning and traditional learning. }
}
@article{Xu201599,
  title = {Topic based context-aware travel recommendation method exploiting geotagged photos },
  journal = {Neurocomputing },
  volume = {155},
  number = {0},
  pages = {99 - 107},
  year = {2015},
  note = {},
  issn = {0925-2312},
  doi = {http://dx.doi.org/10.1016/j.neucom.2014.12.043},
  url = {http://www.sciencedirect.com/science/article/pii/S092523121401707X},
  author = {Zhenxing Xu and Ling Chen and Gencai Chen},
  keywords = {Spatiotemporal data mining},
  keywords = {Geotagged photo},
  keywords = {Context-aware query},
  keywords = {Travel recommender system },
  abstract = {Abstract The popularity of camera phones and photo sharing websites, e.g. Flickr and Panoramio, has led to huge volumes of community-contributed geotagged photos (CCGPs) available on the Internet, which can be regarded as digital footprints of photo takers. In this paper, we propose a method to recommend travel locations in a city for a user, based on the topic distribution of the user's travel histories in other cities and the given context (i.e., season and weather). A topic model is used to mine the interest distributions of users, which are then exploited to build the user–user similarity model and make travel recommendations. The season and weather context information is considered during the mining and recommendation processes. Our method is evaluated on a Flickr dataset, which contains photos taken in 11 cities of China. Experimental results show the effectiveness of the proposed method in terms of the precision of travel behavior prediction. }
}
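
The user-user similarity model described above can be illustrated with plain topic vectors. A toy sketch under stated assumptions: each user's topic distribution is taken as given (e.g. from LDA over their photo tags), and all user names, locations, and numbers are hypothetical.

import numpy as np

topic_dist = {                  # user -> topic distribution (assumed precomputed)
    "u1": np.array([0.7, 0.2, 0.1]),
    "u2": np.array([0.6, 0.3, 0.1]),
    "u3": np.array([0.1, 0.1, 0.8]),
}
visited = {"u1": {"West Lake"}, "u2": {"West Lake", "Lingyin Temple"},
           "u3": {"Ski Resort"}}

def cosine(a, b):
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

target = "u1"
# Rank other users by topic similarity, then suggest their unvisited locations.
peers = sorted((u for u in topic_dist if u != target),
               key=lambda u: cosine(topic_dist[target], topic_dist[u]),
               reverse=True)
recommendations = [loc for u in peers for loc in visited[u] - visited[target]]
print(recommendations)          # most-similar users' locations come first
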
@article{Velardi2014153,
  title = {Twitter mining for fine-grained syndromic surveillance },
  journal = {Artificial Intelligence in Medicine },
  volume = {61},
  number = {3},
  pages = {153 - 163},
  year = {2014},
  note = {Text Mining and Information Analysis of Health Documents },
  issn = {0933-3657},
  doi = {http://dx.doi.org/10.1016/j.artmed.2014.01.002},
  url = {http://www.sciencedirect.com/science/article/pii/S0933365714000049},
  author = {Paola Velardi and Giovanni Stilo and Alberto E. Tozzi and Francesco Gesualdo},
  keywords = {Terminology clustering},
  keywords = {Twitter mining},
  keywords = {Micro-blog mining},
  keywords = {Patient's language learning},
  keywords = {Syndromic surveillance },
  abstract = {Abstract Background: Digital traces left on the Internet by web users, if properly aggregated and analyzed, can represent a huge information dataset able to inform syndromic surveillance systems in real time with data collected directly from individuals. Since people use everyday language rather than medical jargon (e.g. runny nose vs. respiratory distress), knowledge of patients’ terminology is essential for the mining of health related conversations on social networks. Objectives: In this paper we present a methodology for early detection and analysis of epidemics based on mining Twitter messages. In order to reliably trace messages of patients that actually complain of a disease, first, we learn a model of naïve medical language, second, we adopt a symptom-driven, rather than disease-driven, keyword analysis. This approach represents a major innovation compared to previously published work in the field. Method: We first developed an algorithm to automatically learn a variety of expressions that people use to describe their health conditions, thus improving our ability to detect health-related “concepts” expressed in non-medical terms and, in the end, producing a larger body of evidence. We then implemented a Twitter monitoring instrument to finely analyze the presence and combinations of symptoms in tweets. Results: We first evaluate the algorithm's performance on an available dataset of diverse medical condition synonyms, then, we assess its utility in a case study of five common syndromes for surveillance purposes. We show that, by exploiting physicians’ knowledge of symptoms positively or negatively related to a given disease, as well as the correspondence between patients’ “naïve” terminology and medical jargon, not only can we analyze large volumes of Twitter messages related to that disease, but we can also mine micro-blogs with complex queries, performing fine-grained tweet classification (e.g. those reporting influenza-like illness (ILI) symptoms vs. common cold or allergy). Conclusions: Our approach yields a very high level of correlation with flu trends derived from traditional surveillance systems. Compared with Google Flu, another popular tool based on query search volumes, our method is more flexible and less sensitive to changes in web search behaviors. }
}
@article{Yang201417,
  title = {Data mining for rapid prediction of facility fit and debottlenecking of biomanufacturing facilities },
  journal = {Journal of Biotechnology },
  volume = {179},
  number = {0},
  pages = {17 - 25},
  year = {2014},
  note = {},
  issn = {0168-1656},
  doi = {http://dx.doi.org/10.1016/j.jbiotec.2014.03.004},
  url = {http://www.sciencedirect.com/science/article/pii/S0168165614001072},
  author = {Yang Yang and Suzanne S. Farid and Nina F. Thornhill},
  keywords = {Biopharmaceutical manufacture},
  keywords = {Stochastic discrete-event simulation},
  keywords = {Decision tree classification},
  keywords = {Multivariate data analysis},
  keywords = {Data mining },
  abstract = {Abstract Higher titre processes can pose facility fit challenges in legacy biopharmaceutical purification suites with capacities originally matched to lower titre processes. Bottlenecks caused by mismatches in equipment sizes, combined with process fluctuations upon scale-up, can result in discarding expensive product. This paper describes a data mining decisional tool for rapid prediction of facility fit issues and debottlenecking of biomanufacturing facilities exposed to batch-to-batch variability and higher titres. The predictive tool comprised advanced multivariate analysis techniques to interrogate Monte Carlo stochastic simulation datasets that mimicked batch fluctuations in cell culture titres, step yields and chromatography eluate volumes. A decision tree classification method, \{CART\} (classification and regression tree) was introduced to explore the impact of these process fluctuations on product mass loss and reveal the root causes of bottlenecks. The resulting pictorial decision tree determined a series of if-then rules for the critical combinations of factors that lead to different mass loss levels. Three different debottlenecking strategies were investigated involving changes to equipment sizes, using higher capacity chromatography resins and elution buffer optimisation. The analysis compared the impact of each strategy on mass output, direct cost of goods per gram and processing time, as well as consideration of extra capital investment and space requirements. }
}
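
The CART step described above is easy to reproduce in spirit. A minimal sketch, assuming scikit-learn as a stand-in for the authors' tooling; the synthetic "process" rule, variable names, and all parameter values are invented.

import numpy as np
from sklearn.tree import DecisionTreeClassifier, export_text

rng = np.random.default_rng(0)
n = 500
titre = rng.normal(5.0, 1.0, n)          # g/L, mimics batch-to-batch variability
yield_ = rng.normal(0.8, 0.05, n)        # step yield
eluate = rng.normal(100.0, 20.0, n)      # chromatography eluate volume, L
# Hypothetical facility-fit rule: high titre plus large eluate volume causes loss.
mass_loss = ((titre > 5.5) & (eluate > 110)).astype(int)

X = np.column_stack([titre, yield_, eluate])
tree = DecisionTreeClassifier(max_depth=3).fit(X, mass_loss)
# The exported if-then rules play the role of the paper's pictorial decision tree.
print(export_text(tree, feature_names=["titre", "yield", "eluate"]))
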
@article{Truyens2014153,
  title = {Legal aspects of text mining },
  journal = {Computer Law & Security Review },
  volume = {30},
  number = {2},
  pages = {153 - 170},
  year = {2014},
  note = {},
  issn = {0267-3649},
  doi = {http://dx.doi.org/10.1016/j.clsr.2014.01.009},
  url = {http://www.sciencedirect.com/science/article/pii/S0267364914000260},
  author = {Maarten Truyens and Patrick Van Eecke},
  keywords = {Copyright},
  keywords = {Text mining},
  keywords = {Data mining},
  keywords = {Reproduction right},
  keywords = {Databases },
  abstract = {Abstract “Text mining” covers a range of techniques that allow software to extract information from text documents. It is not a new technology, but it has recently received spotlight attention due to the emergence of Big Data. The applications of text mining are very diverse and span multiple disciplines, ranging from biomedicine to legal, business intelligence and security. From a legal perspective, text mining touches upon several areas of law, including contract law, copyright law and database law. This contribution discusses the legal issues encountered during the assembly of texts into so-called “corpora”, as well as the use of such corpora. }
}
@article{Jeong20143605,
  title = {Creating patents on the new technology using analogy-based patent mining },
  journal = {Expert Systems with Applications },
  volume = {41},
  number = {8},
  pages = {3605 - 3614},
  year = {2014},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2013.11.045},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417413009871},
  author = {Cheolhyun Jeong and Kwangsoo Kim},
  keywords = {Analogy},
  keywords = {New technology},
  keywords = {Problem solved concept},
  keywords = {Patent mining},
  keywords = {Patent mapping},
  keywords = {Patent similarity },
  abstract = {Abstract Patents on a new technology, one not yet commercialized and in an early stage of its life cycle, give firms many benefits. However, existing methods are inadequate because they depend on customers and physical prototypes, and there is a lack of systems focused on a problem identification process or on inter-technological comparison. In this research, to remedy these limitations, an analogy-based patent mining system is suggested. The system is built on the assumption that similar problems occur in technologies that have similar properties or functions. Accordingly, the system focuses on identifying a Problem Solved Concept (PSC), which describes what problem is solved in a patent. In the first part of the system, the mature technology, one more mature than the new technology, is described by a property and a function; at least one of the two should be similar to that of the new technology under consideration. The system then extracts PSCs, constructs a patent map, and evaluates the \{PSCs\} using patents on the new and the mature technologies. As a result, the \{PSCs\} with high opportunities are revealed and patents related to those \{PSCs\} are examined. Users of the system then select some patents as resources for analogy. The system is tested in a case study of wireless charger technology, using 352 patents on wireless router technology and 227 patents on wireless charger technology. Finally, patents related to ‘handoff’ showed a high opportunity score, and one of these patents is introduced to show the possibility of patent creation through analogy. }
}
@article{Hashem20142914,
  title = {An efficient approach for mining cross-level closed itemsets and minimal association rules using closed itemset lattices },
  journal = {Expert Systems with Applications },
  volume = {41},
  number = {6},
  pages = {2914 - 2938},
  year = {2014},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2013.09.052},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417413008415},
  author = {Tahrima Hashem and Chowdhury Farhan Ahmed and Md. Samiullah and Sayma Akther and Byeong-Soo Jeong and Seokhee Jeon},
  keywords = {Data mining},
  keywords = {Association rules},
  keywords = {Frequent itemset},
  keywords = {Closed itemset},
  keywords = {Minimal rules},
  keywords = {Closed itemset lattice },
  abstract = {Abstract Multilevel knowledge in transactional databases plays a significant role in real-life market basket analysis. Many researchers have mined hierarchical association rules and proposed various approaches. However, some of the existing approaches produce many multilevel and cross-level association rules that fail to convey quality information. From this large number of redundant association rules, it is extremely difficult to extract any meaningful information. There also exist some approaches that mine minimal association rules, but these have many shortcomings due to their naïve approaches. In this paper, we have focused on the need for generating hierarchical minimal rules that provide maximal information. An algorithm has been proposed to derive minimal multilevel association rules and cross-level association rules. Our work makes significant contributions in mining minimal cross-level association rules, which express the mixed relationship between the generalized and specialized views of the transaction itemsets. We are the first to design an efficient algorithm using a closed itemset lattice-based approach, which can mine the most relevant minimal cross-level association rules. The parent–child relationship of the lattices has been exploited while mining cross-level closed itemset lattices. We have extensively evaluated our proposed algorithm’s efficiency using a variety of real-life datasets and a large number of experiments. The proposed algorithm significantly outperformed the existing related work in extensive performance comparisons. }
}
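
A closed itemset, as used in the entry above, is a frequent itemset with no superset of equal support. The brute-force sketch below illustrates the definition only; it is nothing like the paper's lattice-based algorithm, and the transactions are toy data.

from itertools import combinations

transactions = [{"milk", "bread"}, {"milk", "bread", "butter"},
                {"bread", "butter"}, {"milk", "bread", "butter"}]
minsup = 2

# Enumerate all frequent itemsets with their absolute support.
items = sorted(set().union(*transactions))
support = {}
for k in range(1, len(items) + 1):
    for cand in map(frozenset, combinations(items, k)):
        s = sum(cand <= t for t in transactions)
        if s >= minsup:
            support[cand] = s

# Keep only closed itemsets: no frequent proper superset has the same support.
closed = [i for i, s in support.items()
          if not any(i < j and support[j] == s for j in support)]
for itemset in sorted(closed, key=len):
    print(set(itemset), support[itemset])   # e.g. {'bread'} 4
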
@article{Debeljak201430,
  title = {Modelling forest growing stock from inventory data: A data mining approach },
  journal = {Ecological Indicators },
  volume = {41},
  number = {0},
  pages = {30 - 39},
  year = {2014},
  note = {},
  issn = {1470-160X},
  doi = {http://dx.doi.org/10.1016/j.ecolind.2014.01.010},
  url = {http://www.sciencedirect.com/science/article/pii/S1470160X14000181},
  author = {Marko Debeljak and Aleš Poljanec and Bernard Ženko},
  keywords = {Forest growing stock},
  keywords = {Ecological indicator},
  keywords = {Forest inventory data},
  keywords = {Data mining},
  keywords = {Trend prediction },
  abstract = {Abstract Growing stock is an ecological indicator of forest ecosystem response to natural and anthropogenic impacts that may result from forest management measures or environmental impacts. Information on growing stock is thus essential to understand the dynamics of forest stands and their productive capacity, and to manage their use within the limits of sustainability. Dynamic changes of forest growing stock, as well as predictions of their future development, are usually estimated from data gathered by national forest inventories using some mechanistic modelling approach. The resulting models are informative, but include many parameters, some of which are difficult to set or estimate. Due to the demanding parameterisation of mechanistic models, it is hard to achieve stability of their output accuracy, which can lower their predictive power. This paper presents an alternative and complementary approach of constructing models with machine learning and data mining methods. We applied these methods to the Silva-SI database and used the resulting interpretable models to find explanations for structural changes in Slovenian forests over the period from 1970 to 2010. In addition, we developed predictive models for growing stock in the decade from 2010 to 2020. The structure of the models describing the temporal dynamics of growing stock shows that growing stock trends are increasing for the entire studied period, while accumulation of growing stock is much more intensive after 1990. Forests with a lower growing stock are located either in areas with unfavorable site conditions for forest growth, or at lower altitudes, where they are more exposed to human exploitation due to their proximity to more densely populated regions. Predictions of growing stock for the decade 2010–2020 suggest that Slovenian forests will continue to accumulate growing stock (privately owned forests to 327 m3/ha and state-owned forests to 343 m3/ha in 2020). The presented data mining approach, here applied to growing stock, can also be used to investigate other ecological indicators. }
}
@article{Kwon2014721,
  title = {A real time process management system using \{RFID\} data mining },
  journal = {Computers in Industry },
  volume = {65},
  number = {4},
  pages = {721 - 732},
  year = {2014},
  note = {},
  issn = {0166-3615},
  doi = {http://dx.doi.org/10.1016/j.compind.2014.02.007},
  url = {http://www.sciencedirect.com/science/article/pii/S0166361514000414},
  author = {Kyunglag Kwon and Daehyun Kang and Yeochang Yoon and Jong-Soo Sohn and In-Jeong Chung},
  keywords = {Process management},
  keywords = {\{RFID\}},
  keywords = {Data mining},
  keywords = {Procedure Tree},
  keywords = {Enterprise Resource Planning},
  keywords = {Real time system },
  abstract = {Abstract Recently, there have been numerous efforts to fuse the latest Radio Frequency Identification (RFID) technology with the Enterprise Information System (EIS). However, in most cases these attempts are centered mainly on the simultaneous multiple reading capability of \{RFID\} technology, and thus neglect the management of the massive data generated by the \{RFID\} reader. As a result, it is difficult to obtain flow information for \{RFID\} data mining related to real time process control. In this paper, we propose an advanced process management method, called ‘Procedure Tree’ (PT), for \{RFID\} data mining. Using the suggested PT, we are able to manage massive \{RFID\} data effectively and perform real time process management efficiently. We then evaluate the efficiency of the proposed method after applying it to a real time process control system connected to the RFID-based EIS. For the verification of the suggested system, we collect an enormous amount of data in the Enterprise Resource Planning (ERP) database, analyze the characteristics of the collected data, and then compute the elapsed time of each stage in process control. The suggested system was able to do what traditional RFID-based process control systems failed to do, such as real time process prediction and tracking and inventory control. }
}
@article{Hsieh201417,
  title = {Reducing the bottleneck of graph-based data mining by improving the efficiency of labeled graph isomorphism testing },
  journal = {Data & Knowledge Engineering },
  volume = {91},
  number = {0},
  pages = {17 - 33},
  year = {2014},
  note = {},
  issn = {0169-023X},
  doi = {http://dx.doi.org/10.1016/j.datak.2014.02.003},
  url = {http://www.sciencedirect.com/science/article/pii/S0169023X14000172},
  author = {Shu-Ming Hsieh and Chiun-Chieh Hsu and Yen-Wu Ti and Chi-Jung Kuo},
  keywords = {Data mining},
  keywords = {Mining methods and algorithm},
  keywords = {Isomorphism testing},
  keywords = {Graph signature},
  keywords = {Search state-space tree },
  abstract = {Abstract Due to the complex nature of graph representations, the isomorphism testing between a pair of labeled graphs becomes one of the most time-consuming procedures during the process of graph-based data mining. In order to reduce this bottleneck, in this paper we propose a novel efficient algorithm to perform isomorphism testing of labeled graphs which in general performs less state-space tree searching. The proposed method uses graph signatures as the first-step filter, and it limits the backtracking occurring only between each pair of corresponding vertex classes, based on the proposed data structures and the vertex partition method. We compared the proposed method with state-of-the-art methods to verify its efficiency for several datasets each with different aspects of characteristics. The experimental results show that for irregular graphs, either labeled or unlabeled, the proposed method outperforms the compared methods in efficiency. For graphs with multiple labels but high regularity, the proposed method is still better than the compared methods. The result of this algorithm is directly applicable to those emerging applications related to graph-based data mining which need to perform isomorphism testing of labeled graphs in large databases. }
}
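
As a baseline for the labeled graph isomorphism testing discussed above, networkx's VF2-based matcher can serve; the paper's contribution is to prune exactly this kind of state-space search. The graphs below are toy examples, not from the paper.

import networkx as nx
from networkx.algorithms.isomorphism import categorical_node_match

G1 = nx.Graph()
G1.add_nodes_from([(1, {"label": "C"}), (2, {"label": "O"}), (3, {"label": "C"})])
G1.add_edges_from([(1, 2), (2, 3)])

G2 = nx.Graph()
G2.add_nodes_from([(9, {"label": "C"}), (8, {"label": "C"}), (7, {"label": "O"})])
G2.add_edges_from([(9, 7), (7, 8)])

# Labeled isomorphism: node labels must match, not just the topology.
print(nx.is_isomorphic(G1, G2,
                       node_match=categorical_node_match("label", None)))  # True
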
@article{Liu2011370,
  title = {Mining group-based knowledge flows for sharing task knowledge },
  journal = {Decision Support Systems },
  volume = {50},
  number = {2},
  pages = {370 - 386},
  year = {2011},
  note = {},
  issn = {0167-9236},
  doi = {http://dx.doi.org/10.1016/j.dss.2010.09.004},
  url = {http://www.sciencedirect.com/science/article/pii/S0167923610001739},
  author = {Duen-Ren Liu and Chin-Hui Lai},
  keywords = {Knowledge flow},
  keywords = {Group-based knowledge flow},
  keywords = {Knowledge graph},
  keywords = {Knowledge sharing},
  keywords = {Data mining},
  keywords = {Topic},
  keywords = {Task },
  abstract = {In an organization, knowledge is the most important resource in the creation of core competitive advantages. It is circulated and accumulated by knowledge flows (KFs) in the organization to support workers' task needs. Because workers accumulate knowledge of different domains, they may cooperate and participate in several task-based groups to satisfy their needs. In this paper, we propose algorithms that integrate information retrieval and data mining techniques to mine and construct group-based \{KFs\} (GKFs) for task-based groups. A \{GKF\} is expressed as a directed knowledge graph which represents the knowledge referencing behavior, or knowledge flow, of a group of workers with similar task needs. Task-related knowledge topics and their relationships (flows) can be identified from the knowledge graph so as to fulfill workers' task needs and promote knowledge sharing for collaboration of group members. Moreover, the frequent knowledge referencing path can be identified from the knowledge graph to indicate the frequent knowledge flow of the workers. To demonstrate the efficacy of the proposed methods, we implement a prototype of the \{GKF\} mining system. Our \{GKF\} mining methods can enhance organizational learning and facilitate knowledge management, sharing, and reuse in an environment where collaboration and teamwork are essential. }
}
@article{Wijnhoven2014262,
  title = {External validity of sentiment mining reports: Can current methods identify demographic biases, event biases, and manipulation of reviews? },
  journal = {Decision Support Systems },
  volume = {59},
  number = {0},
  pages = {262 - 273},
  year = {2014},
  note = {},
  issn = {0167-9236},
  doi = {http://dx.doi.org/10.1016/j.dss.2013.12.005},
  url = {http://www.sciencedirect.com/science/article/pii/S0167923613002947},
  author = {Fons Wijnhoven and Oscar Bloemen},
  keywords = {Sentiment mining},
  keywords = {Opinion mining},
  keywords = {External validity},
  keywords = {Demographic bias},
  keywords = {Event bias},
  keywords = {Product review manipulation},
  keywords = {Design proposition validation },
  abstract = {Abstract Many publications in sentiment mining provide new techniques for improved accuracy in extracting features and corresponding sentiments in texts. For the external validity of these sentiment reports, i.e., the applicability of the results to target audiences, it is important to carefully analyze the context of the user-generated content and its sample of authors. The literature lacks an analysis of the external validity of sentiment mining reports, and the sentiment mining field lacks an operationalization of external validity dimensions into practically useful techniques. From a kernel theory, we identify multiple threats to sentiment mining external validity and study three of them empirically: 1) a mismatch in the demographics of the reviewer sample, 2) bias due to reviewers' incidental experiences, and 3) manipulation of reviews. The value of techniques for identifying external validity threats is then examined in cases from Goodread.com. We conclude that demographic biases can be well detected by current techniques, although we have doubts regarding stylometric techniques for this purpose. We demonstrate the usefulness of event and manipulation bias detection techniques in our cases, but this result needs further replication in more complex and more competitive contexts. Finally, to increase the decisional usefulness of sentiment mining reports, they should be accompanied by external validity reports, and software and service providers in this field should incorporate these in their offerings. }
}
@article{Liew2014393,
  title = {Sustainability trends in the process industries: A text mining-based analysis },
  journal = {Computers in Industry },
  volume = {65},
  number = {3},
  pages = {393 - 400},
  year = {2014},
  note = {\{ICT\} for Sustainability in Industry },
  issn = {0166-3615},
  doi = {http://dx.doi.org/10.1016/j.compind.2014.01.004},
  url = {http://www.sciencedirect.com/science/article/pii/S0166361514000207},
  author = {Wan Te Liew and Arief Adhitya and Rajagopalan Srinivasan},
  keywords = {Sustainability trends},
  keywords = {Text mining},
  keywords = {TF-IDF},
  keywords = {Ontology},
  keywords = {Chemical industry},
  keywords = {Corporate Social Responsibility Report },
  abstract = {Abstract Sustainability is widely recognized as one of the most important challenges facing the world today. Companies publish sustainability reports that present their efforts and achievements in meeting sustainability goals and targets. In this paper, text mining is used to identify sustainability trends and practices in the process industries. Four main sectors of the industry are studied: oil/petrochemicals, bulk/specialty chemicals, pharmaceuticals, and consumer products. Our study reveals that the top sustainability focuses of the four sectors are very similar: health and safety, human rights, reducing GHG, conserving energy/energy efficiency, and community investment. Sector-specific sustainability issues have also been identified, for example oil spill prevention in the oil/petrochemicals sector and access to medicine in the pharmaceuticals sector. The environment is identified as the predominant sustainability aspect in the process industries. The text mining methodology, results, and findings are detailed in the paper. }
}
@article{Graening2014166,
  title = {Shape mining: A holistic data mining approach for engineering design },
  journal = {Advanced Engineering Informatics },
  volume = {28},
  number = {2},
  pages = {166 - 185},
  year = {2014},
  note = {},
  issn = {1474-0346},
  doi = {http://dx.doi.org/10.1016/j.aei.2014.03.002},
  url = {http://www.sciencedirect.com/science/article/pii/S1474034614000184},
  author = {Lars Graening and Bernhard Sendhoff},
  keywords = {Computer aided engineering},
  keywords = {Data mining},
  keywords = {Unified design representation},
  keywords = {Design concepts},
  keywords = {Sensitivity & interaction analysis},
  keywords = {Passenger car design },
  abstract = {Abstract Although the integration of engineering data within the framework of product data management systems has been successful in recent years, the holistic analysis (from a systems engineering perspective) of multi-disciplinary data or data based on different representations and tools is still not realized in practice. At the same time, the application of advanced data mining techniques to complete designs is very promising and bears a high potential for synergy between different teams in the development process. In this paper, we propose shape mining as a framework to combine and analyze data from engineering design across different tools and disciplines. In the first part of the paper, we introduce unstructured surface meshes as meta-design representations that enable us to apply sensitivity analysis, design concept retrieval and learning as well as methods for interaction analysis to heterogeneous engineering design data. We propose a new measure of relevance to evaluate the utility of a design concept. In the second part of the paper, we apply the formal methods to passenger car design. We combine data from different representations, design tools and methods for a holistic analysis of the resulting shapes. We visualize sensitivities and sensitive cluster centers (after feature reduction) on the car shape. Furthermore, we are able to identify conceptual design rules using tree induction and to create interaction graphs that illustrate the interrelation between spatially decoupled surface areas. Shape data mining in this paper is studied for a multi-criteria aerodynamic problem, i.e. drag force and rear lift; however, the extension to quality criteria from different disciplines is straightforward as long as the meta-design representation is still applicable. }
}
@article{PeñaAyala20141432,
  title = {Educational data mining: A survey and a data mining-based analysis of recent works },
  journal = {Expert Systems with Applications },
  volume = {41},
  number = {4, Part 1},
  pages = {1432 - 1462},
  year = {2014},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2013.08.042},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417413006635},
  author = {Alejandro Peña-Ayala},
  keywords = {Data mining},
  keywords = {Educational data mining},
  keywords = {Data mining profile},
  keywords = {Educational data mining approach pattern},
  keywords = {Pattern for descriptive and predictive educational data mining approaches },
  abstract = {Abstract This review pursues a twofold goal: the first is to preserve and enhance the chronicles of recent educational data mining (EDM) advances; the second is to organize, analyze, and discuss the content of the review based on the outcomes produced by a data mining (DM) approach. Thus, as a result of the selection and analysis of 240 \{EDM\} works, an \{EDM\} work profile was compiled to describe 222 \{EDM\} approaches and 18 tools. A profile of the \{EDM\} works was organized as a raw database, which was transformed into an ad-hoc database suitable to be mined. As a result of the execution of statistical and clustering processes, a set of educational functionalities was found, a realistic pattern of \{EDM\} approaches was discovered, and two patterns of value-instances to depict \{EDM\} approaches based on descriptive and predictive models were identified. One key finding is that most of the \{EDM\} approaches are grounded in a basic set composed of three kinds each of educational systems, disciplines, tasks, methods, and algorithms. The review concludes with a snapshot of the surveyed \{EDM\} works, and provides an analysis of the \{EDM\} strengths, weaknesses, opportunities, and threats, whose factors represent, in a sense, future work to be fulfilled. }
}
@article{Qu2014544,
  title = {Pattern mining of cloned codes in software systems },
  journal = {Information Sciences },
  volume = {259},
  number = {0},
  pages = {544 - 554},
  year = {2014},
  note = {},
  issn = {0020-0255},
  doi = {http://dx.doi.org/10.1016/j.ins.2010.04.022},
  url = {http://www.sciencedirect.com/science/article/pii/S0020025510001787},
  author = {Wei Qu and Yuanyuan Jia and Michael Jiang},
  keywords = {Pattern mining},
  keywords = {Software clone detection},
  keywords = {Software reuse detection},
  keywords = {Software engineering },
  abstract = {Pattern mining of cloned codes in software systems is a challenging task due to various modifications and the large size of software codes. Most existing approaches adopt a token-based software representation and use sequential analysis for pattern mining of cloned codes. Due to the intrinsic limitations of such spatial space analysis, these methods have difficulties handling statement reordering, insertion and control replacement. Recently, graph-based models such as the program dependence graph have been exploited to solve these issues. Although they can improve the performance in terms of accuracy, they introduce additional problems: their computational complexity is very high and increases dramatically with the software size, thus limiting their applications in practice. In this paper, we propose a novel pattern mining framework for cloned codes in software systems. It efficiently exploits software’s spatial space information as well as graph space information and thus can mine accurate patterns of cloned codes for software systems. Preliminary experimental results have demonstrated the superior performance of the proposed approach compared with other methods. }
}
@article{Pachidi2014583,
  title = {Understanding users’ behavior with software operation data mining },
  journal = {Computers in Human Behavior },
  volume = {30},
  number = {0},
  pages = {583 - 594},
  year = {2014},
  note = {},
  issn = {0747-5632},
  doi = {http://dx.doi.org/10.1016/j.chb.2013.07.049},
  url = {http://www.sciencedirect.com/science/article/pii/S0747563213002884},
  author = {Stella Pachidi and Marco Spruit and Inge van de Weerd},
  keywords = {Software usage},
  keywords = {User behavior},
  keywords = {Software operation knowledge},
  keywords = {Software analytics},
  keywords = {Log data},
  keywords = {Data mining },
  abstract = {Abstract Software usage concerns knowledge about how end-users use the software in the field, and how the software itself responds to their actions. In this paper, we present the Usage Mining Method to guide the analysis of data collected during software operation, in order to extract knowledge about how a software product is used by the end-users. Our method suggests three analysis tasks which employ data mining techniques for extracting usage knowledge from software operation data: user profiling, clickstream analysis and classification analysis. The Usage Mining Method was evaluated through a prototype that was executed in the case of Exact Online, the main online financial management application in the Netherlands. The evaluation confirmed the supportive role of the Usage Mining Method in software product management and development processes, as well as the applicability of the suggested data mining algorithms to carry out the usage analysis tasks. }
}
@article{Lee2014397,
  title = {Exploration of geo-tagged photos through data mining approaches },
  journal = {Expert Systems with Applications },
  volume = {41},
  number = {2},
  pages = {397 - 405},
  year = {2014},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2013.07.065},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417413005526},
  author = {Ickjai Lee and Guochen Cai and Kyungmi Lee},
  keywords = {Clustering},
  keywords = {Association rules mining},
  keywords = {Geo-tagged photo},
  keywords = {Points-of-interest },
  abstract = {Abstract With the development of web technologies and social network sites, people can now easily produce and share information online. The photo-sharing website Flickr stores a huge number of photos that people upload and share. This research proposes a framework to extract associative points-of-interest patterns from geo-tagged photos in Queensland, Australia, a popular tourist destination hosting the Great Barrier Reef and tropical rainforest. The framework combines two popular data mining techniques: clustering for points-of-interest detection, and association rules mining for associative points-of-interest patterns. We report interesting experimental results and discuss the findings. }
}
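
The first stage of the framework above, points-of-interest detection, can be sketched with a density-based clusterer. A hedged example assuming scikit-learn's DBSCAN; the coordinates and parameters are invented, and the paper's exact clustering method may differ.

import numpy as np
from sklearn.cluster import DBSCAN

coords = np.array([                     # (lat, lon) of geotagged photos, toy data
    [-16.92, 145.77], [-16.921, 145.771], [-16.919, 145.769],  # dense spot A
    [-27.47, 153.03], [-27.471, 153.028],                      # dense spot B
    [-20.0, 148.0],                                            # isolated photo
])
labels = DBSCAN(eps=0.01, min_samples=2).fit_predict(coords)
print(labels)                           # cluster ids per photo; -1 marks noise
# Each photo's cluster id (a candidate point-of-interest) can then feed
# association rule mining over the POIs visited by the same user.
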
@article{Paramasivam2014139,
  title = {A methodological review of data mining techniques in predictive medicine: An application in hemodynamic prediction for abdominal aortic aneurysm disease },
  journal = {Biocybernetics and Biomedical Engineering },
  volume = {34},
  number = {3},
  pages = {139 - 145},
  year = {2014},
  note = {},
  issn = {0208-5216},
  doi = {http://dx.doi.org/10.1016/j.bbe.2014.03.003},
  url = {http://www.sciencedirect.com/science/article/pii/S0208521614000266},
  author = {Vijayajothi Paramasivam and Tan Sing Yee and Sarinder K. Dhillon and Amandeep S. Sidhu},
  keywords = {Data mining techniques},
  keywords = {Hemodynamic prediction},
  keywords = {Abdominal aortic aneurysm },
  abstract = {Abstract Modern clinics and hospitals need accurate real-time prediction tools. This paper reviews the importance and present trends of data mining methodologies in predictive medicine by focusing on hemodynamic predictions in abdominal aortic aneurysm (AAA). It also provides potential data mining working frameworks for hemodynamic predictions in AAA. These frameworks either couple a typical computational modeling simulation with various data mining techniques, mine existing medical datasets of real patients directly with various data mining techniques, or implement a visual data mining approach on already available computed results for various hemodynamic features within the \{AAA\} models. These approaches allow the possibility of statistically predicting the rupture potential of aneurismal patients and ideally provide an alternative to tedious and time-consuming computational modeling. Predicting trends in patient-specific aneurismal conditions by mining huge volumes of medical data can also speed up decision making in real-life medicine. }
}
@article{Özyirmidokuz2014320,
  title = {Mining Unstructured Turkish Economy News Articles },
  journal = {Procedia Economics and Finance },
  volume = {16},
  number = {0},
  pages = {320 - 328},
  year = {2014},
  note = {21st International Economic Conference of Sibiu 2014, \{IECS\} 2014: Prospects of Economic Recovery in a Volatile International Context: Major Obstacles, Initiatives and Projects },
  issn = {2212-5671},
  doi = {http://dx.doi.org/10.1016/S2212-5671(14)00809-0},
  url = {http://www.sciencedirect.com/science/article/pii/S2212567114008090},
  author = {Esra Kahya Özyirmidokuz},
  keywords = {Knowledge discovery in databases},
  keywords = {Text mining},
  keywords = {Natural language processing },
  abstract = {Abstract Text mining is the analysis of unstructured data by combining techniques from knowledge discovery in databases, natural language processing, information retrieval, and machine learning. Text mining allows us to analyze web content dynamically to find meaningful patterns within large collections of textual data. There are too many economic news articles to read, so it is necessary to summarize them. In this study, text mining is used to analyze the vast amount of text produced in newspaper articles in Turkey. We mine unstructured economy news with natural language processing techniques including tokenization, case transformation, stopword filtering and stemming. Similarity analysis is also used to determine similar documents. The word vector is extracted, so economy news articles are structured into numeric representations that summarize them. In addition, k-means clustering is used. Consequently, the clusters and the similarities of the articles are obtained. }
}
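
The pipeline in the abstract above (tokenization, stopword filtering, term weighting, k-means) maps directly onto standard tooling. A compact sketch assuming scikit-learn; the Turkish-specific stemming step is omitted and the toy articles are in English.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

news = [
    "central bank raises interest rates",
    "interest rates expected to rise again",
    "new tax incentives for exporters announced",
    "exporters welcome the tax incentive package",
]
# TfidfVectorizer handles tokenization, lowercasing and stopword filtering.
X = TfidfVectorizer(stop_words="english").fit_transform(news)
labels = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(X)
print(labels)   # articles sharing a label discuss similar topics, e.g. [0 0 1 1]
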
@article{KrysiakBaltyn2014160,
  title = {Compass: A hybrid method for clinical and biobank data mining },
  journal = {Journal of Biomedical Informatics },
  volume = {47},
  number = {0},
  pages = {160 - 170},
  year = {2014},
  note = {},
  issn = {1532-0464},
  doi = {http://dx.doi.org/10.1016/j.jbi.2013.10.007},
  url = {http://www.sciencedirect.com/science/article/pii/S1532046413001597},
  author = {K. Krysiak-Baltyn and T. Nordahl Petersen and K. Audouze and Niels Jørgensen and L. Ängquist and S. Brunak},
  keywords = {Data mining},
  keywords = {Clinical data},
  keywords = {Rule extraction},
  keywords = {Self-Organizing Map},
  keywords = {Association mining },
  abstract = {Abstract We describe a new method for identification of confident associations within large clinical data sets. The method is a hybrid of two existing methods: Self-Organizing Maps and Association Mining. We utilize Self-Organizing Maps as the initial step to reduce the search space, and then apply Association Mining in order to find association rules. We demonstrate that this procedure has a number of advantages compared to traditional Association Mining; it allows for handling numerical variables without a priori binning and is able to generate variable groups which act as “hotspots” for statistically significant associations. We showcase the method on infertility-related data from Danish military conscripts. The clinical data we analyzed contained both categorical type questionnaire data and continuous variables generated from biological measurements, including missing values. From this data set, we successfully generated a number of interesting association rules, which relate an observation to a specific consequence, together with the p-value for that finding. Additionally, we demonstrate that the method can be used on non-clinical data containing chemical–disease associations in order to find associations between different phenotypes, such as prostate cancer and breast cancer. }
}
@article{Vimieiro20141,
  title = {A new method for mining disjunctive emerging patterns in high-dimensional datasets using hypergraphs },
  journal = {Information Systems },
  volume = {40},
  number = {0},
  pages = {1 - 10},
  year = {2014},
  note = {},
  issn = {0306-4379},
  doi = {http://dx.doi.org/10.1016/j.is.2013.09.001},
  url = {http://www.sciencedirect.com/science/article/pii/S0306437913001221},
  author = {Renato Vimieiro and Pablo Moscato},
  keywords = {Emerging patterns},
  keywords = {Contrast pattern mining},
  keywords = {Associative classifier},
  keywords = {Minimal transversals},
  keywords = {Hypergraphs},
  keywords = {Microarray data },
  abstract = {Abstract We investigate in this paper the problem of mining disjunctive emerging patterns in high-dimensional biomedical datasets. Disjunctive emerging patterns are sets of features that are very frequent among samples of a target class, cases in a case–control study, for example, and are very rare among all other samples. We, for the very first time, demonstrate that this problem can be solved using minimal transversals in a hypergraph. We propose a new divide-and-conquer algorithm that enables us to efficiently compute disjunctive emerging patterns in parallel and distributed environments. We conducted experiments using real-world microarray gene expression datasets to assess the performance of our approach. Our results show that our approach is more efficient than the state-of-the-art solution available in the literature. In this sense, we contribute to the area of bioinformatics and data mining by providing another useful alternative to identify patterns distinguishing samples with different class labels, such as those in case–control studies, for example. }
}
@article{Aghdaie2014767,
  title = {Synergies of Data Mining and Multiple Attribute Decision Making },
  journal = {Procedia - Social and Behavioral Sciences },
  volume = {110},
  number = {0},
  pages = {767 - 776},
  year = {2014},
  note = {The 2-dn International Scientific conference „Contemporary Issues in Business, Management and Education 2013“ },
  issn = {1877-0428},
  doi = {http://dx.doi.org/10.1016/j.sbspro.2013.12.921},
  url = {http://www.sciencedirect.com/science/article/pii/S1877042813055614},
  author = {Mohammad Hasan Aghdaie and Sarfaraz Hashemkhani Zolfani and Edmundas Kazimieras Zavadskas},
  keywords = {data mining},
  keywords = {Multiple Attribute Decision Making (MADM)},
  keywords = {Clustering},
  keywords = {\{SWARA\}},
  keywords = {\{VIKOR\}},
  keywords = {Supplier clustering and ranking },
  abstract = {Abstract Data Mining (DM) and Multiple Attribute Decision Making (MADM) are two fast growing trends in Operations Research (OR)/Management Science (MS). In this article, we identify the synergies of data mining and MADM. Synergies can be attained by integrating \{MADM\} techniques into data mining and vice versa. The primary goal of the paper is to show the wide range of interactions between these two fields from a new perspective, with an example of the integrated approach in supplier clustering and ranking. The integrated approach includes cluster analysis as a data mining tool and Step-wise Weight Assessment Ratio Analysis (SWARA) and VIseKriterijumska Optimizacija I Kompromisno Resenje (VIKOR) as the two \{MADM\} tools. More precisely, the features for clustering were selected and weighted by the \{SWARA\} method and suppliers were clustered using two-stage cluster analysis. In addition, the \{VIKOR\} technique is used to rank the clusters from best to worst. The integrated approach is presented to demonstrate the applicability of the proposed methodology. }
}
@article{GarcíaBorroto20154859,
  title = {Finding the best diversity generation procedures for mining contrast patterns },
  journal = {Expert Systems with Applications },
  volume = {42},
  number = {11},
  pages = {4859 - 4866},
  year = {2015},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2015.02.028},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417415001359},
  author = {Milton García-Borroto and José Fco. Martínez-Trinidad and Jesús Ariel Carrasco-Ochoa},
  keywords = {Understandable classifiers},
  keywords = {Contrast patterns},
  keywords = {Ensemble diversity},
  keywords = {Deterministic procedures },
  abstract = {Abstract Most understandable classifiers are based on contrast patterns, which can be accurately mined from decision trees. Nevertheless, tree diversity must be ensured to mine a representative pattern collection. In this paper, we performed an experimental comparison among different diversity generation procedures. We compare the diversity generated by each procedure based on the number of total, unique, and minimal patterns extracted from the induced trees for different minimal support thresholds. This comparison, together with an accuracy and abstention experiment, shows that Random Forest and Bagging generate the most diverse and accurate pattern collections. Additionally, we study the influence of data type on the results, finding that Random Forest is best for categorical data and Bagging for numerical data. The comparison includes the best-known diversity generation procedures and three new deterministic procedures introduced here. These deterministic procedures outperform the existing deterministic method, but are still outperformed by the random procedures. }
}
@article{Calders2014233,
  title = {Mining frequent itemsets in a stream },
  journal = {Information Systems },
  volume = {39},
  number = {0},
  pages = {233 - 255},
  year = {2014},
  note = {},
  issn = {0306-4379},
  doi = {http://dx.doi.org/10.1016/j.is.2012.01.005},
  url = {http://www.sciencedirect.com/science/article/pii/S0306437912000087},
  author = {Toon Calders and Nele Dexters and Joris J.M. Gillis and Bart Goethals},
  keywords = {Frequent itemset mining},
  keywords = {Datastream},
  keywords = {Theory},
  keywords = {Algorithm},
  keywords = {Experiments },
  abstract = {Mining frequent itemsets in a datastream proves to be a difficult problem, as itemsets arrive in rapid succession and storing parts of the stream is typically impossible. Nonetheless, it has many useful applications, e.g., opinion and sentiment analysis from social networks. Current stream mining algorithms are based on approximations. In earlier work, mining frequent items in a stream under the max-frequency measure proved to be effective for items. In this paper, we extended our work from items to itemsets. Firstly, an optimized incremental algorithm for mining frequent itemsets in a stream is presented. The algorithm maintains a very compact summary of the stream for selected itemsets. Secondly, we show that further compacting the summary is non-trivial. Thirdly, we establish a connection between the size of a summary and results from number theory. Fourthly, we report results of extensive experimentation, on both synthetic and real-world datasets, showing the efficiency of the algorithm both in terms of time and space. }
}
@article{DeAngelis2014720,
  title = {Mining categorical sequences from data using a hybrid clustering method },
  journal = {European Journal of Operational Research },
  volume = {234},
  number = {3},
  pages = {720 - 730},
  year = {2014},
  note = {},
  issn = {0377-2217},
  doi = {http://dx.doi.org/10.1016/j.ejor.2013.11.002},
  url = {http://www.sciencedirect.com/science/article/pii/S0377221713009016},
  author = {Luca De Angelis and José G. Dias},
  keywords = {Data mining},
  keywords = {Sequential data},
  keywords = {Hidden Markov models},
  keywords = {Clustering},
  keywords = {Categorical data },
  abstract = {Abstract The identification of different dynamics in sequential data has become an every day need in scientific fields such as marketing, bioinformatics, finance, or social sciences. Contrary to cross-sectional or static data, these observations (also known as stream data, temporal data, longitudinal data or repeated measures) are more challenging as one has to incorporate data dependency in the clustering process. In this research we focus on clustering categorical sequences. The method proposed here combines model-based and heuristic clustering. In the first step, the categorical sequences are transformed by an extension of the hidden Markov model into a probabilistic space, where a symmetric Kullback–Leibler distance can operate. Then, in the second step, using hierarchical clustering on the matrix of distances, the sequences can be clustered. This paper illustrates the enormous potential of this type of hybrid approach using a synthetic data set as well as the well-known Microsoft dataset with website users' search patterns and a survey on job career dynamics. }
}
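
The second, heuristic step described above can be sketched in isolation. An illustrative example assuming the per-sequence probability profiles from the hidden Markov step are already available; the profiles below are invented.

import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import squareform

profiles = np.array([      # rows: sequences; columns: state probabilities
    [0.7, 0.2, 0.1],
    [0.6, 0.3, 0.1],
    [0.1, 0.2, 0.7],
])

def sym_kl(p, q):
    # Symmetric Kullback-Leibler divergence between two distributions.
    return float(np.sum(p * np.log(p / q)) + np.sum(q * np.log(q / p)))

n = len(profiles)
D = np.zeros((n, n))
for i in range(n):
    for j in range(i + 1, n):
        D[i, j] = D[j, i] = sym_kl(profiles[i], profiles[j])

# Hierarchical clustering on the condensed distance matrix.
Z = linkage(squareform(D), method="average")
print(fcluster(Z, t=2, criterion="maxclust"))   # e.g. [1 1 2]
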
@article{Strohmeier20132410,
  title = {Domain driven data mining in human resource management: A review of current research },
  journal = {Expert Systems with Applications },
  volume = {40},
  number = {7},
  pages = {2410 - 2420},
  year = {2013},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2012.10.059},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417412011839},
  author = {Stefan Strohmeier and Franca Piazza},
  keywords = {Data mining},
  keywords = {\{HRM\}},
  keywords = {Literature review},
  keywords = {Domain driven data mining},
  keywords = {\{HR\} data mining},
  keywords = {Electronic human resource management (e-HRM) },
  abstract = {An increasing number of publications concerning data mining in the subject of human resource management (HRM) indicate the presence of a prospering new research field. The current paper reviews this research on \{HR\} data mining to systematically uncover recent advancements and suggest areas for future work. Based on the approach of domain driven data mining, an initial framework with significant domain-specific requirements is elaborated. Relevant research contributions are identified and reviewed against the background of this framework. The review reveals that \{HRM\} constitutes a noteworthy new domain of data mining research that is dominated by method- and technology-oriented work. However, specific domain requirements, such as evaluating the domain success or complying with legal standards, are frequently not recognized or considered in current research. Therefore, the systematic consideration of domain-specific requirements is demonstrated here to have significant implications for future research on data mining in HRM. }
}
@article{Mohamad2013320,
  title = {Educational Data Mining: A Review },
  journal = {Procedia - Social and Behavioral Sciences },
  volume = {97},
  number = {0},
  pages = {320 - 324},
  year = {2013},
  note = {The 9th International Conference on Cognitive Science },
  issn = {1877-0428},
  doi = {http://dx.doi.org/10.1016/j.sbspro.2013.10.240},
  url = {http://www.sciencedirect.com/science/article/pii/S1877042813036859},
  author = {Siti Khadijah Mohamad and Zaidatun Tasir},
  keywords = {Algorithm},
  keywords = {Data mining},
  keywords = {Educational data mining},
  keywords = {Elearning},
  keywords = {Online interaction },
  abstract = {Abstract Data mining is very useful in the field of education, especially when examining students’ learning behavior in online learning environments. This is due to the potential of data mining for analyzing and uncovering hidden information in the data itself, which is hard and very time-consuming to do manually. The purpose of this review is to look into how data mining was tackled by previous scholars and the latest trends in data mining in educational research. Several limitations of existing research are discussed and some directions for future research are suggested. }
}
@article{Braun2014223,
  title = {A Tree-based Algorithm for Mining Diverse Social Entities },
  journal = {Procedia Computer Science },
  volume = {35},
  number = {0},
  pages = {223 - 232},
  year = {2014},
  note = {Knowledge-Based and Intelligent Information & Engineering Systems: 18th Annual Conference, KES-2014, Gdynia, Poland, September 2014 Proceedings },
  issn = {1877-0509},
  doi = {http://dx.doi.org/10.1016/j.procs.2014.08.102},
  url = {http://www.sciencedirect.com/science/article/pii/S1877050914010679},
  author = {Peter Braun and Alfredo Cuzzocrea and Carson K. Leung and Richard Kyle MacKinnon and Syed K. Tanbeer},
  keywords = {Data mining},
  keywords = {diverse friends},
  keywords = {friendship patterns},
  keywords = {intelligent information & engineering systems},
  keywords = {knowledge based and expert systems},
  keywords = {social computing systems},
  keywords = {social network analysis },
  abstract = {Abstract DiSE-growth, a tree-based (pattern-growth) algorithm for mining \{DIverse\} Social Entities, is proposed and experimentally assessed in this paper. The algorithm makes use of a specialized data structure, called DiSE-tree, for effectively and efficiently representing relevant information on diverse social entities while successfully supporting the mining phase. Diverse entities are popular in a wide spectrum of application scenarios, ranging from linked Web data to Semantic Web and social networks. In all these application scenarios, it has become important to analyze high volumes of valuable linked data and discover those diverse social entities. We complement our analytical contributions by means of an experimental evaluation that clearly shows the benefits of our tree-based diverse social entity mining algorithm. }
}
@incollection{Krallinger201451,
  title = {6.04 - Text Mining },
  editor = {Brahme, Anders },
  booktitle = {Comprehensive Biomedical Physics },
  publisher = {Elsevier},
  edition = {},
  address = {Oxford},
  year = {2014},
  pages = {51 - 66},
  isbn = {978-0-444-53633-4},
  doi = {http://dx.doi.org/10.1016/B978-0-444-53632-7.01107-2},
  url = {http://www.sciencedirect.com/science/article/pii/B9780444536327011072},
  author = {M. Krallinger and F. Leitner and M. Vazquez and A. Valencia},
  keywords = {BioCreative},
  keywords = {Biomedical literature},
  keywords = {Information extraction},
  keywords = {Information retrieval},
  keywords = {Natural language processing},
  keywords = {Text categorization},
  keywords = {Text mining },
  abstract = {Abstract It is becoming increasingly difficult to keep up with the amount of information published in the scientific literature, both for domain experts and for the sake of maintaining up-to-date biological databases based on manual curation of articles. This issue has been addressed with the help of text mining technologies specifically adapted to the biomedical domain. The aim of these strategies is to be more efficient in the retrieval and classification of relevant documents and the detection of bio-entities in text. Text mining is used for the automatic extraction of interactions between, and functional annotations of, biological substances, and it links articles with existing objects in the annotation databases. This chapter provides a general overview of the main tasks in biomedical text mining and natural language processing, introducing the underlying methods and existing applications tailored to handle the rapidly growing amount of literature data. }
}
@article{Thomas2014457,
  title = {Studying software evolution using topic models },
  journal = {Science of Computer Programming },
  volume = {80, Part B},
  number = {0},
  pages = {457 - 479},
  year = {2014},
  note = {},
  issn = {0167-6423},
  doi = {http://dx.doi.org/10.1016/j.scico.2012.08.003},
  url = {http://www.sciencedirect.com/science/article/pii/S0167642312001621},
  author = {Stephen W. Thomas and Bram Adams and Ahmed E. Hassan and Dorothea Blostein},
  keywords = {Software evolution},
  keywords = {Topic model},
  keywords = {Latent Dirichlet allocation},
  keywords = {Mining software repositories },
  abstract = {Topic models are generative probabilistic models which have been applied to information retrieval to automatically organize and provide structure to a text corpus. Topic models discover topics in the corpus, which represent real world concepts by frequently co-occurring words. Recently, researchers found topics to be effective tools for structuring various software artifacts, such as source code, requirements documents, and bug reports. This research also hypothesized that using topics to describe the evolution of software repositories could be useful for maintenance and understanding tasks. However, research has yet to determine whether these automatically discovered topic evolutions describe the evolution of source code in a way that is relevant or meaningful to project stakeholders, and thus it is not clear whether topic models are a suitable tool for this task. In this paper, we take a first step towards evaluating topic models in the analysis of software evolution by performing a detailed manual analysis on the source code histories of two well-known and well-documented systems, \{JHotDraw\} and jEdit. We define and compute various metrics on the discovered topic evolutions and manually investigate how and why the metrics evolve over time. We find that the large majority (87%–89%) of topic evolutions correspond well with actual code change activities by developers. We are thus encouraged to use topic models as tools for studying the evolution of a software system. }
}
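A hedged sketch of the kind of pipeline Thomas2014457 evaluates: fitting LDA to software-history text with scikit-learn. The corpus, the topic count and the library choice below are illustrative assumptions, not the paper's setup.

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

commits = [                      # invented commit messages
    "fix null pointer in parser",
    "add unit tests for parser module",
    "refactor rendering pipeline",
    "speed up rendering of large documents",
]
X = CountVectorizer(stop_words="english").fit_transform(commits)
lda = LatentDirichletAllocation(n_components=2, random_state=0).fit(X)
# lda.components_ holds per-topic word weights; tracking how these
# weights shift across versions is one way to study topic evolution.
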
@article{Jiang201518,
  title = {SG-WSTD: A framework for scalable geographic web search topic discovery },
  journal = {Knowledge-Based Systems },
  volume = {84},
  number = {0},
  pages = {18 - 33},
  year = {2015},
  note = {},
  issn = {0950-7051},
  doi = {http://dx.doi.org/10.1016/j.knosys.2015.03.020},
  url = {http://www.sciencedirect.com/science/article/pii/S0950705115001148},
  author = {Di Jiang and Jan Vosecky and Kenneth Wai-Ting Leung and Lingxiao Yang and Wilfred Ng},
  keywords = {Topic model},
  keywords = {Search engine},
  keywords = {Information retrieval },
  abstract = {Abstract Search engine query logs are recognized as an important information source that contains millions of users’ web search needs. Discovering Geographic Web Search Topics (G-WSTs) from a query log can support a variety of downstream web applications such as finding commonality between locations and profiling search engine users. However, the task of discovering G-WSTs is nontrivial, not only because of the diversity of the information in web search but also due to the sheer size of the query log. In this paper, we propose a new framework, Scalable Geographic Web Search Topic Discovery (SG-WSTD), which contains highly scalable functionalities such as search session derivation, geographic information extraction and geographic web search topic discovery to discover G-WSTs from the query log. Within SG-WSTD, two probabilistic topic models are proposed to discover G-WSTs from two complementary perspectives. The first one is the Discrete Search Topic Model (DSTM), which discovers G-WSTs that capture the commonalities between discrete locations. The second is the Regional Search Topic Model (RSTM), which focuses on a specific geographic region on the map and discovers G-WSTs that demonstrate geographic locality. Since query logs are typically voluminous, we implement the functionalities in SG-WSTD based on the MapReduce paradigm to solve the efficiency bottleneck. We evaluate SG-WSTD against several strong baselines on a real-life query log from AOL. The proposed framework demonstrates significantly improved data interpretability, better prediction performance, higher topic distinctiveness and superior scalability in the experimentation. }
}
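SG-WSTD's scalability rests on MapReduce; the toy below mimics the map/shuffle/reduce decomposition in plain Python over invented query-log records, keyed by (location, term), just to make the paradigm concrete.

from collections import defaultdict

log = [("u1", "pizza near me", "NYC"), ("u2", "museum hours", "NYC"),
       ("u3", "surf lessons", "LA")]

def map_phase(record):
    user, query, location = record
    for term in query.split():
        yield (location, term), 1      # emit one count per (location, term)

shuffled = defaultdict(int)            # shuffle + reduce: sum by key
for record in log:
    for key, value in map_phase(record):
        shuffled[key] += value

print(sorted(shuffled.items()))
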
@article{Braun2014338,
  title = {Effectively and Efficiently Mining Frequent Patterns from Dense Graph Streams on Disk },
  journal = {Procedia Computer Science },
  volume = {35},
  number = {0},
  pages = {338 - 347},
  year = {2014},
  note = {Knowledge-Based and Intelligent Information & Engineering Systems 18th Annual Conference, KES-2014 Gdynia, Poland, September 2014 Proceedings },
  issn = {1877-0509},
  doi = {http://dx.doi.org/10.1016/j.procs.2014.08.114},
  url = {http://www.sciencedirect.com/science/article/pii/S1877050914010795},
  author = {Peter Braun and Juan J. Cameron and Alfredo Cuzzocrea and Fan Jiang and Carson K. Leung},
  keywords = {Data mining},
  keywords = {frequent pattern mining},
  keywords = {graph streams},
  keywords = {knowledge-based and intelligent information & engineering systems},
  keywords = {knowledge discovery},
  keywords = {limited memory},
  keywords = {stream mining },
  abstract = {Abstract In this paper, we focus on dense graph streams, which can be generated in various applications ranging from sensor networks to social networks, from bio-informatics to chemical informatics. We also investigate the problem of effectively and efficiently mining frequent patterns from such streaming data, in the targeted case of dealing with limited memory environments so that disk support is required. This setting occurs frequently (e.g., in mobile applications/systems) and is gaining momentum even in advanced computational settings where social networks are the main representative. Inspired by this problem, we propose (i) a specialized data structure called DSMatrix, which captures important data from dense graph streams onto the disk directly and (ii) stream mining algorithms that make use of such structure in order to mine frequent patterns effectively and efficiently. Experimental results clearly confirm the benefits of our approach. }
}
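The paper's DSMatrix layout is not reproduced here; as a loose illustration of keeping per-edge occurrence information in a compact matrix-like form, the sketch below assigns each edge a bitset with one bit per stream batch, so support becomes a cheap popcount. Batches and the threshold are invented.

from collections import defaultdict

batches = [                      # edges observed in each stream batch
    {("a", "b"), ("b", "c")},
    {("a", "b")},
    {("a", "b"), ("b", "c")},
]
bits = defaultdict(int)
for i, batch in enumerate(batches):
    for edge in batch:
        bits[edge] |= 1 << i     # set the bit for batch i

minsup = 2
frequent = {e: bin(b).count("1") for e, b in bits.items()
            if bin(b).count("1") >= minsup}
print(frequent)                  # {('a', 'b'): 3, ('b', 'c'): 2}
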
@article{Feng2014409,
  title = {Mining user-contributed photos for personalized product recommendation },
  journal = {Neurocomputing },
  volume = {129},
  number = {0},
  pages = {409 - 420},
  year = {2014},
  note = {},
  issn = {0925-2312},
  doi = {http://dx.doi.org/10.1016/j.neucom.2013.09.018},
  url = {http://www.sciencedirect.com/science/article/pii/S0925231213009363},
  author = {He Feng and Xueming Qian},
  keywords = {Products recommender},
  keywords = {Personalized recommendation},
  keywords = {Hierarchical topic space},
  keywords = {Social media },
  abstract = {Abstract With the advent and popularity of social media, users are willing to share their experiences by photos, reviews, blogs, and so on. The social media contents shared by these users reveal potential shopping needs. Product recommenders are not limited to e-commerce sites; they can also be extended to social media sites. In this paper, we propose a novel hierarchical user interest mining (Huim) approach for personalized product recommendation. The input of our approach consists of user-contributed photos and user generated content (UGC), which include user-annotated photo tags and the comments from others in a social site. The proposed approach consists of four steps. First, we make full use of the visual information and \{UGC\} of the photos to mine the user's interest. Second, we represent user interest by a topic distribution vector, and apply our proposed Huim to enhance interest-related topics. Third, we represent each product by a topic distribution vector as well. Finally, we measure the relevance of user and product in the topic space and determine the rank of each product for the user. We conduct a series of experiments on Flickr users and the products from Bing Shopping. Experimental results show the effectiveness of the proposed approach. }
}
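The final matching step described in Feng2014409 (ranking products by relevance to the user in a shared topic space) reduces to a similarity computation; a minimal sketch with invented topic vectors, assuming cosine similarity as the measure:

import numpy as np

user = np.array([0.6, 0.1, 0.3])                  # user topic mixture
products = {"camera": np.array([0.7, 0.1, 0.2]),  # product topic mixtures
            "novel":  np.array([0.1, 0.8, 0.1])}

def cosine(a, b):
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

ranked = sorted(products, key=lambda p: cosine(user, products[p]),
                reverse=True)
print(ranked)                                     # ['camera', 'novel']
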
@article{PhridviRaj2014255,
  title = {Data Mining – Past, Present and Future – A Typical Survey on Data Streams },
  journal = {Procedia Technology },
  volume = {12},
  number = {0},
  pages = {255 - 263},
  year = {2014},
  note = {The 7th International Conference Interdisciplinarity in Engineering, INTER-ENG 2013, 10-11 October 2013, Petru Maior University of Tirgu Mures, Romania },
  issn = {2212-0173},
  doi = {http://dx.doi.org/10.1016/j.protcy.2013.12.483},
  url = {http://www.sciencedirect.com/science/article/pii/S2212017313006683},
  author = {M.S.B. PhridviRaj and C.V. GuruRao},
  keywords = {Clustering},
  keywords = {Streams},
  keywords = {Mining},
  keywords = {Dimensionality reduction},
  keywords = {Text stream},
  keywords = {Data streams },
  abstract = {Abstract Data stream mining is one of the areas gaining a lot of practical significance and is progressing at a brisk pace, with new methods, methodologies and findings in various applications related to medicine, computer science, bioinformatics, stock market prediction, weather forecasting, and text, audio and video processing, to name a few. Data happens to be the key concern in data mining. With the huge volumes of online data generated from sensors, Internet Relay Chats, Twitter, Facebook, and online bank or \{ATM\} transactions, dynamically changing data is becoming a key challenge; this is what we call data streams. In this paper, we give an algorithm for finding frequent patterns from data streams with a case study and identify the research issues in handling data streams. }
}
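For a concrete instance of the frequent-patterns-over-streams problem this survey discusses, here is the classic lossy counting algorithm for frequent items in one pass; the choice of this particular algorithm is ours, not necessarily the one given in the paper.

def lossy_counting(stream, epsilon):
    """One-pass approximate counts; underestimates by at most epsilon*n."""
    width = int(1 / epsilon)                    # bucket width
    counts = {}                                 # item -> [freq, delta]
    for n, item in enumerate(stream, start=1):
        bucket = (n - 1) // width + 1           # current bucket id
        if item in counts:
            counts[item][0] += 1
        else:
            counts[item] = [1, bucket - 1]      # may have missed earlier buckets
        if n % width == 0:                      # bucket boundary: prune
            counts = {k: v for k, v in counts.items()
                      if v[0] + v[1] > bucket}
    return {k: v[0] for k, v in counts.items()}

print(lossy_counting("abacabadabacaba", epsilon=0.25))  # {'a': 8, 'b': 1}
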
@article{Zheng20157110,
  title = {Predictive modeling of hospital readmissions using metaheuristics and data mining },
  journal = {Expert Systems with Applications },
  volume = {42},
  number = {20},
  pages = {7110 - 7120},
  year = {2015},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2015.04.066},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417415003085},
  author = {Bichen Zheng and Jinghe Zhang and Sang Won Yoon and Sarah S. Lam and Mohammad Khasawneh and Srikanth Poranki},
  keywords = {Neural networks},
  keywords = {Support vector machine},
  keywords = {Particle swarm optimization},
  keywords = {Hospital readmission},
  keywords = {Risk prediction },
  abstract = {Abstract This research studies the risk prediction of hospital readmissions using metaheuristic and data mining approaches. This is a critical issue in the U.S. healthcare system because a large percentage of preventable hospital readmissions derive from a low quality of care during patients’ stays in the hospital as well as poor arrangement of the discharge process. To reduce the number of hospital readmissions, the Centers for Medicare and Medicaid Services has launched a readmission penalty program in which hospitals receive reduced reimbursement for high readmission rates for Medicare beneficiaries. In current practice, patient readmission risk is widely assessed by evaluating a \{LACE\} score including length of stay (L), acuity level of admission (A), comorbidity condition (C), and use of emergency rooms (E). However, the \{LACE\} threshold classifying high- and low-risk readmitted patients is set up by clinical practitioners based on specific circumstances and experiences. This research proposes various data mining approaches to identify the risk group of a particular patient, including a neural network model, a random forest (RF) algorithm, and a hybrid model of a swarm intelligence heuristic and a support vector machine (SVM). The proposed neural network algorithm and the \{RF\} and \{SVM\} classifiers are used to model patients’ characteristics, such as their ages, insurance payers, medication risks, etc. Experiments are conducted to compare the performance of the proposed models with previous research. Experimental results indicate that the proposed \{SVM\} prediction model with particle swarm parameter tuning outperforms the other algorithms, achieving 78.4% overall prediction accuracy and 97.3% sensitivity. The high sensitivity shows its strength in correctly identifying readmitted patients. The outcome of this research will help reduce overall hospital readmission rates and allow hospitals to utilize their resources more efficiently to enhance interventions for high-risk patients. }
}
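The LACE score the abstract describes combines points for Length of stay, Acuity of admission, Comorbidity and Emergency visits; the sketch below shows the shape of such a scorer, with point mappings that are illustrative assumptions, not clinical guidance.

def lace_score(days_in_hospital, acute_admission, comorbidity_pts, ed_visits_6mo):
    # L: length-of-stay points (illustrative banding)
    if days_in_hospital < 1:
        los = 0
    elif days_in_hospital <= 3:
        los = days_in_hospital
    elif days_in_hospital <= 6:
        los = 4
    elif days_in_hospital <= 13:
        los = 5
    else:
        los = 7
    a = 3 if acute_admission else 0          # A: acute admission
    c = min(comorbidity_pts, 5)              # C: comorbidity points, capped
    e = min(ed_visits_6mo, 4)                # E: recent ED visits, capped
    return los + a + c + e

print(lace_score(5, True, 2, 1))             # 4 + 3 + 2 + 1 = 10
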
@article{Chen201324,
  title = {Application of data mining in a global optimization algorithm },
  journal = {Advances in Engineering Software },
  volume = {66},
  number = {0},
  pages = {24 - 33},
  year = {2013},
  note = {Civil-Comp Conference Special Issue },
  issn = {0965-9978},
  doi = {http://dx.doi.org/10.1016/j.advengsoft.2012.11.019},
  url = {http://www.sciencedirect.com/science/article/pii/S0965997812001743},
  author = {T.Y. Chen and J.H. Huang},
  keywords = {Global optimization algorithm},
  keywords = {Data mining},
  keywords = {Evolution strategy},
  keywords = {Sequential quadratic programming},
  keywords = {Reduced search space},
  keywords = {Hybrid search method },
  abstract = {A hybrid global optimization algorithm is developed in this research. The probability of finding the global optimal solution is increased by reducing the search space. The activities of classification, association, and clustering in data mining are employed to achieve this purpose. The hybrid algorithm developed uses data mining (DM), evolution strategy (ES) and sequential quadratic programming (SQP) to search for the global optimal solution. For unconstrained optimization problems, data mining techniques are used to determine a smaller search region that contains the global solution. For constrained optimization problems, the data mining techniques are used to find the approximate feasible region or the feasible region with better objective values. Numerical examples demonstrate that this hybrid algorithm can effectively find the global optimal solutions for two benchmark test problems. }
}
@incollection{Nguyen201495,
  title = {Chapter 4 - Text Mining and Network Analysis of Digital Libraries in R },
  editor = {Zhao, Yanchang and Cen, Yonghua },
  booktitle = {Data Mining Applications with R },
  publisher = {Academic Press},
  edition = {},
  address = {Boston},
  year = {2014},
  pages = {95 - 115},
  isbn = {978-0-12-411511-8},
  doi = {http://dx.doi.org/10.1016/B978-0-12-411511-8.00004-9},
  url = {http://www.sciencedirect.com/science/article/pii/B9780124115118000049},
  author = {Eric Nguyen},
  keywords = {Text mining},
  keywords = {Latent Dirichlet allocation},
  keywords = {Social networking analysis},
  keywords = {Betweenness centrality },
  abstract = {Abstract With the availability of open and structured data, digital libraries have become an important source of data for recent data mining techniques. The inherent structure of digital libraries comes with information about date, authorship, involved institutions, geographic context, and large volumes of text. As an illustration of digital library analysis, we will focus on a space of scientific publications related to some science discipline and extract patterns and text-related information from the dataset. Text mining techniques offer a reliable and efficient way to quantify many aspects of digital libraries, as well as methods to cluster and regroup digital content in relation to topical content. This chapter will discuss text preparation techniques in R, the use of the latent Dirichlet allocation to classify content, and the analysis of text features, like co-occurrence matrices. Simple similarity measures between authors and between papers will be used to illustrate cluster cohesion within the dataset. }
}
@incollection{Cohen2014141,
  title = {Chapter 6 - Biomedical Natural Language Processing and Text Mining },
  editor = {Sarkar, Indra Neil },
  booktitle = {Methods in Biomedical Informatics },
  publisher = {Academic Press},
  edition = {},
  address = {Oxford},
  year = {2014},
  pages = {141 - 177},
  isbn = {978-0-12-401678-1},
  doi = {http://dx.doi.org/10.1016/B978-0-12-401678-1.00006-3},
  url = {http://www.sciencedirect.com/science/article/pii/B9780124016781000063},
  author = {Kevin Bretonnel Cohen},
  keywords = {Natural language processing},
  keywords = {Text mining},
  keywords = {BioNLP },
  abstract = {Abstract Natural language processing and text mining (“BioNLP”) are branches of biomedical informatics that deal with processing prose, whether in journal articles or electronic medical records, for purposes such as extracting information, cohort retrieval, and other uses. They are made difficult by the rampant presence of ambiguity and variability in human-produced prose. In addition, biomedical text poses special challenges on a number of levels. Machine learning and rule-based approaches both have a long history in biomedical natural language processing, and hybrid systems are common. Much progress has been made in biomedical natural language processing and text mining in recent years, and the field is poised for explosive growth as new resources should become available in the near future. Many open opportunities for research remain. }
}
@article{Khwaldeh2013232,
  title = {Atomic Data Mining Numerical Methods, Source Code \{SQlite\} with Python },
  journal = {Procedia - Social and Behavioral Sciences },
  volume = {73},
  number = {0},
  pages = {232 - 239},
  year = {2013},
  note = {Proceedings of the 2nd International Conference on Integrated Information (IC-ININFO 2012), Budapest, Hungary, August 30 – September 3, 2012 },
  issn = {1877-0428},
  doi = {http://dx.doi.org/10.1016/j.sbspro.2013.02.046},
  url = {http://www.sciencedirect.com/science/article/pii/S187704281300339X},
  author = {Ali Khwaldeh and Amani Tahat and Jordi Marti and Mofleh Tahat},
  keywords = {Python},
  keywords = {atomic data},
  keywords = {database},
  keywords = {data mining algorithms},
  keywords = {data model},
  keywords = {collaborative intelligence},
  keywords = {machine learning },
  abstract = {This paper introduces a recently published Python data mining book (chapters, topics, samples of Python source code written by its authors) to be used in data mining via the world wide web and any specific database in several disciplines (economics, physics, education, marketing, etc.). The book starts with an introduction to data mining, explaining some of the data mining tasks involved: classification, dependence modelling, clustering and discovery of association rules. The book notes that using Python for data mining has been gaining interest in the data mining community because Python is an open source, general purpose programming and web scripting language; furthermore, it is cross platform and can be run on a wide variety of operating systems such as Linux, Windows, FreeBSD, Macintosh, Solaris, OS/2, Amiga, AROS, AS/400, BeOS, OS/390, z/OS, Palm OS, QNX, VMS, Psion, Acorn \{RISC\} OS, VxWorks, PlayStation, Sharp Zaurus, Windows \{CE\} and even PocketPC. Finally, this book can be considered a teaching textbook for data mining in which several methods, such as machine learning and statistics, are used to extract high-level knowledge from real-world datasets. }
}
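In the spirit of the book's Python-plus-SQLite theme, a minimal self-contained pattern (schema and values invented): load records into an in-memory SQLite table, then aggregate with SQL.

import sqlite3

con = sqlite3.connect(":memory:")
con.execute("CREATE TABLE lines (element TEXT, wavelength_nm REAL)")
con.executemany("INSERT INTO lines VALUES (?, ?)",
                [("H", 656.28), ("H", 486.13), ("He", 587.56)])
for row in con.execute(
        "SELECT element, COUNT(*) FROM lines GROUP BY element"):
    print(row)                      # ('H', 2) then ('He', 1)
con.close()
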
@article{Yakushev20142462,
  title = {Social Networks Mining for Analysis and Modeling Drugs Usage },
  journal = {Procedia Computer Science },
  volume = {29},
  number = {0},
  pages = {2462 - 2471},
  year = {2014},
  note = {2014 International Conference on Computational Science },
  issn = {1877-0509},
  doi = {http://dx.doi.org/10.1016/j.procs.2014.05.230},
  url = {http://www.sciencedirect.com/science/article/pii/S1877050914004074},
  author = {Andrei Yakushev and Sergey Mityagin},
  keywords = {social media},
  keywords = {data mining},
  keywords = {big data},
  keywords = {illicit drug use},
  keywords = {map reduce},
  keywords = {clavire},
  keywords = {feature selection },
  abstract = {Abstract This paper presents an approach for mining and analyzing data from social media, based on using the MapReduce model to process big amounts of data and on using composite applications, executed on a cloud-based distributed computing environment, to perform more sophisticated analyses. We applied this system to characterize users who write about drugs and to estimate factors that can be used as part of a model for predicting the drug usage level in the real world. We propose to use social media as an additional data source that complements official data sources for analyzing and modeling illegal activities in society. }
}
@article{Mrázová2014308,
  title = {Mining the Czech Insolvency Proceedings Data },
  journal = {Procedia Computer Science },
  volume = {36},
  number = {0},
  pages = {308 - 313},
  year = {2014},
  note = {Complex Adaptive Systems Philadelphia, \{PA\} November 3-5, 2014 },
  issn = {1877-0509},
  doi = {http://dx.doi.org/10.1016/j.procs.2014.09.098},
  url = {http://www.sciencedirect.com/science/article/pii/S1877050914013477},
  author = {Iveta Mrázová and Peter Zvirinský},
  keywords = {data mining},
  keywords = {knowledge extraction},
  keywords = {data pre-processing},
  keywords = {social network analysis},
  keywords = {community discovery },
  abstract = {Abstract The Global Financial Crisis of 2008 has left behind it many victims worldwide – both among bankrupt companies and indebted people with a grim future ahead. On January 1, 2008, the government of the Czech Republic launched a new information system called Insolvency Register of the Czech Republic. Meanwhile, the Czech Insolvency Register contains publicly available data concerning more than 100 000 insolvency proceedings. Modern data mining methods quite naturally represent an appealing approach to analyze these huge amounts of open source data. In this context, especially the techniques of Bayesian networks and social network analysis seem to reveal several new socio-economic patterns present in the current Czech society. }
}
@article{Wu20151,
  title = {Modeling query-document dependencies with topic language models for information retrieval },
  journal = {Information Sciences },
  volume = {312},
  number = {0},
  pages = {1 - 12},
  year = {2015},
  note = {},
  issn = {0020-0255},
  doi = {http://dx.doi.org/10.1016/j.ins.2015.03.056},
  url = {http://www.sciencedirect.com/science/article/pii/S0020025515002212},
  author = {Meng-Sung Wu},
  keywords = {Topic model},
  keywords = {Information retrieval},
  keywords = {Query-document relevance},
  keywords = {Latent Dirichlet allocation },
  abstract = {Abstract This paper addresses deficiencies in current information retrieval models by integrating the concept of relevance into the generation model using various topical aspects of the query. The models are adapted from the latent Dirichlet allocation model, but differ in the way that the notion of query-document relevance is introduced in the modeling framework. In the first method, query terms are added to relevant documents in the training of the latent Dirichlet allocation model. In the second method, the latent Dirichlet allocation model is expanded to deal with relevant query terms. The topic of each term within a given document may be sampled using either the normal document-specific mixture weights in \{LDA\} or query-specific mixture weights. We also developed an efficient method based on the Gibbs sampling technique for parameter estimation. Experiment results based on the Text \{REtrieval\} Conference (TREC) corpus demonstrate the superiority of the proposed models. }
}
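The first method described (adding query terms to relevant documents before training LDA) is easy to sketch; the corpus, the relevance judgments and the use of scikit-learn below are illustrative assumptions, not the paper's implementation.

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

docs = ["stock market trading report", "tennis open final results"]
query, relevant = "market prices", {0}    # doc 0 judged relevant (invented)

# Append the query's terms to relevant documents so the topics absorb
# the relevance signal, then fit LDA on the augmented corpus.
augmented = [d + " " + query if i in relevant else d
             for i, d in enumerate(docs)]
X = CountVectorizer().fit_transform(augmented)
lda = LatentDirichletAllocation(n_components=2, random_state=0).fit(X)
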
@article{Chen201582,
  title = {Building bridges across electronic health record systems through inferred phenotypic topics },
  journal = {Journal of Biomedical Informatics },
  volume = {55},
  number = {0},
  pages = {82 - 93},
  year = {2015},
  note = {},
  issn = {1532-0464},
  doi = {http://dx.doi.org/10.1016/j.jbi.2015.03.011},
  url = {http://www.sciencedirect.com/science/article/pii/S1532046415000544},
  author = {You Chen and Joydeep Ghosh and Cosmin Adrian Bejan and Carl A. Gunter and Siddharth Gupta and Abel Kho and David Liebovitz and Jimeng Sun and Joshua Denny and Bradley Malin},
  keywords = {Clinical phenotype modeling},
  keywords = {Computers and information processing},
  keywords = {Data mining},
  keywords = {Electronic medical records},
  keywords = {Medical information systems},
  keywords = {Pattern recognition },
  abstract = {Abstract Objective: Data in electronic health records (EHRs) is being increasingly leveraged for secondary uses, ranging from biomedical association studies to comparative effectiveness. To perform studies at scale and transfer knowledge from one institution to another in a meaningful way, we need to harmonize the phenotypes in such systems. Traditionally, this has been accomplished through expert specification of phenotypes via standardized terminologies, such as billing codes. However, this approach may be biased by the experience and expectations of the experts, as well as the vocabulary used to describe such patients. The goal of this work is to develop a data-driven strategy to (1) infer phenotypic topics within patient populations and (2) assess the degree to which such topics facilitate a mapping across populations in disparate healthcare systems. Methods: We adapt a generative topic modeling strategy, based on latent Dirichlet allocation, to infer phenotypic topics. We utilize a variance analysis to assess the projection of a patient population from one healthcare system onto the topics learned from another system. The consistency of learned phenotypic topics was evaluated using (1) the similarity of topics, (2) the stability of a patient population across topics, and (3) the transferability of a topic across sites. We evaluated our approaches using four months of inpatient data from two geographically distinct healthcare systems: (1) Northwestern Memorial Hospital (NMH) and (2) Vanderbilt University Medical Center (VUMC). Results: The method learned 25 phenotypic topics from each healthcare system. The average cosine similarity between matched topics across the two sites was 0.39, a remarkably high value given the very high dimensionality of the feature space. The average stability of \{VUMC\} and \{NMH\} patients across the topics of the two sites was 0.988 and 0.812, respectively, as measured by the Pearson correlation coefficient. The \{VUMC\} and \{NMH\} topics also show smaller variance in characterizing the patient populations of the two sites than standard clinical terminologies (e.g., ICD9), suggesting they may be more reliably transferred across hospital systems. Conclusions: Phenotypic topics learned from \{EHR\} data can be more stable and transferable than billing codes for characterizing the general status of a patient population. This suggests that EHR-based research may be able to leverage such phenotypic topics as variables when pooling patient populations in predictive models. }
}
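The two headline measurements (cosine similarity between matched topics across sites, Pearson correlation as a stability proxy) are both one-liners; a miniature with invented topic-word vectors standing in for the learned topics:

import numpy as np

topic_site_a = np.array([0.4, 0.3, 0.2, 0.1])   # topic-word vector, site A
topic_site_b = np.array([0.5, 0.2, 0.2, 0.1])   # matched topic, site B

cos = np.dot(topic_site_a, topic_site_b) / (
    np.linalg.norm(topic_site_a) * np.linalg.norm(topic_site_b))
pearson = np.corrcoef(topic_site_a, topic_site_b)[0, 1]
print(round(float(cos), 3), round(float(pearson), 3))
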
@article{Zhou2012810,
  title = {An Outlier Mining Algorithm Based on Dissimilarity },
  journal = {Procedia Environmental Sciences },
  volume = {12, Part B},
  number = {0},
  pages = {810 - 814},
  year = {2012},
  note = {2011 International Conference of Environmental Science and Engineering },
  issn = {1878-0296},
  doi = {http://dx.doi.org/10.1016/j.proenv.2012.01.352},
  url = {http://www.sciencedirect.com/science/article/pii/S1878029612003532},
  author = {Ming-jian Zhou and Xue-jiao Chen},
  keywords = {Outlier},
  keywords = {dissimilarity},
  keywords = {data mining},
  keywords = {algorithm },
  abstract = {Outlier mining is a hot topic in data mining. After studying commonly used outlier mining methods, this paper presents an outlier mining algorithm based on dissimilarity, OMABD (Outlier Mining Algorithm Based on Dissimilarity). The algorithm first constructs a dissimilarity matrix from the dissimilarity between each pair of objects in the data set, then computes the dissimilarity degree of each object from the dissimilarity matrix, and finally detects outliers by comparing the dissimilarity degree with a dissimilarity threshold. The experimental results show that this algorithm can detect outliers efficiently. }
}
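The OMABD procedure as the abstract outlines it is short enough to sketch end to end; Euclidean distance and the mean-plus-one-standard-deviation threshold below are our assumptions, not the paper's exact choices.

import numpy as np

X = np.array([[1.0, 1.0], [1.1, 0.9], [0.9, 1.1], [5.0, 5.0]])  # invented data
D = np.linalg.norm(X[:, None, :] - X[None, :, :], axis=-1)  # dissimilarity matrix
degree = D.sum(axis=1)                        # dissimilarity degree per object
threshold = degree.mean() + degree.std()      # our threshold rule (assumption)
print(np.where(degree > threshold)[0])        # [3]: the far point is flagged
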
@article{Panasyuk2014160,
  title = {Controversial Topic Discovery on Members of Congress with Twitter },
  journal = {Procedia Computer Science },
  volume = {36},
  number = {0},
  pages = {160 - 167},
  year = {2014},
  note = {Complex Adaptive Systems Philadelphia, \{PA\} November 3-5, 2014 },
  issn = {1877-0509},
  doi = {http://dx.doi.org/10.1016/j.procs.2014.09.073},
  url = {http://www.sciencedirect.com/science/article/pii/S1877050914013222},
  author = {Aleksey Panasyuk and Edmund Szu-Li Yu and Kishan G. Mehrotra},
  keywords = {Twitter},
  keywords = {Latent Dirichlet Allocation},
  keywords = {Topic Modeling},
  keywords = {Polarizing Topics},
  keywords = {Semantic Extraction},
  keywords = {Social Media Mining },
  abstract = {Abstract This paper addresses how Twitter can be used for identifying conflict between communities of users. We aggregate documents by topic and by community and perform sentiment analysis, which allows us to analyze the overall opinion of each community about each topic. We rank the topics with opposing views (negative for one community and positive for the other). To illustrate the proposed methodology, we chose a problem whose results can be evaluated using news articles. We look at tweets of Republican and Democratic members of the 112th House of Representatives from September to December 2013 and demonstrate that our approach is successful by comparing against articles in the news media. }
}
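A minimal sketch of the ranking idea: aggregate sentiment per community and topic, keep topics where the two communities' mean sentiments have opposite signs, and rank by the gap. Scores are invented; a real system would take them from a sentiment classifier.

from collections import defaultdict

scores = [  # (community, topic, sentiment in [-1, 1]) -- invented
    ("R", "budget", -0.6), ("D", "budget", 0.5),
    ("R", "weather", 0.2), ("D", "weather", 0.3),
]
acc = defaultdict(list)
for community, topic, s in scores:
    acc[(community, topic)].append(s)
mean = {k: sum(v) / len(v) for k, v in acc.items()}
topics = {t for _, t in mean}
controversial = {t: abs(mean[("R", t)] - mean[("D", t)])
                 for t in topics
                 if mean[("R", t)] * mean[("D", t)] < 0}  # opposing signs only
print(sorted(controversial, key=controversial.get, reverse=True))  # ['budget']
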
@article{Wang20133518,
  title = {Implicit feature identification via hybrid association rule mining },
  journal = {Expert Systems with Applications },
  volume = {40},
  number = {9},
  pages = {3518 - 3531},
  year = {2013},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2012.12.060},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417412013012},
  author = {Wei Wang and Hua Xu and Wei Wan},
  keywords = {Opinion mining},
  keywords = {Implicit features},
  keywords = {Hybrid association rule mining},
  keywords = {Collocation extraction },
  abstract = {In sentiment analysis, a finer-grained opinion mining method not only focuses on the view of the product itself, but also focuses on product features, which can be a component or attribute of the product. Previous related research mainly relied on explicit features but ignored implicit features. However, the implicit features, which are implied by some words or phrases, are so significant that they can express the users’ opinion and help us to better understand the users’ comments. It is a big challenge to detect these implicit features in Chinese product reviews, due to the complexity of Chinese. This paper is mainly centered on implicit feature identification in Chinese product reviews. A novel hybrid association rule mining method is proposed for this task. The core idea of this approach is mining as many association rules as possible via several complementary algorithms. Firstly, we extract candidate feature indicators based on word segmentation, part-of-speech (POS) tagging and feature clustering, then compute the co-occurrence degree between the candidate feature indicators and the feature words using five collocation extraction algorithms. Each indicator and the corresponding feature word constitute a rule (feature indicator → feature word). The best rules in the five different rule sets are chosen as the basic rules. Next, three methods are proposed to mine some possibly reasonable rules from the lower co-occurrence feature indicators and non-indicator words. Finally, the latest rules are used to identify implicit features and the results are compared with the previous ones. Experiment results demonstrate that our proposed approach is competent at the task, especially when using the several expansion methods. The recall is effectively improved, suggesting that the shortcomings of the basic rules have been overcome to a certain extent. Besides those high co-occurrence degree indicators, the final rules also contain uncommon rules. }
}
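One plausible member of the paper's family of collocation measures is pointwise mutual information; here is a PMI scorer over (indicator, feature word) pairs in invented sentences, from which rules of the form indicator -> feature word could be kept above a cutoff. The data, measure choice and cutoff are our assumptions.

import math
from collections import Counter

sentences = [{"big", "screen"}, {"big", "screen", "bright"},
             {"cheap", "price"}, {"big", "price"}]       # invented
n = len(sentences)
word = Counter(w for s in sentences for w in s)
pair = Counter((a, b) for s in sentences
               for a in s for b in s if a < b)           # unordered pairs

def pmi(a, b):
    key = (min(a, b), max(a, b))
    if pair[key] == 0:
        return float("-inf")
    return math.log((pair[key] / n) / ((word[a] / n) * (word[b] / n)))

print(round(pmi("big", "screen"), 3))   # 0.288: candidate rule big -> screen
print(round(pmi("big", "price"), 3))    # -0.405: weak, would be filtered
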
@article{Zeng2015293,
  title = {Clinical data mining },
  journal = {Computers in Biology and Medicine },
  volume = {62},
  number = {0},
  pages = {293},
  year = {2015},
  note = {},
  issn = {0010-4825},
  doi = {http://dx.doi.org/10.1016/j.compbiomed.2015.05.014},
  url = {http://www.sciencedirect.com/science/article/pii/S001048251500181X},
  author = {Qing Treitler Zeng and Samah Fodeh}
}
@article{Harrag2014558,
  title = {Text mining approach for knowledge extraction in Sahîh Al-Bukhari },
  journal = {Computers in Human Behavior },
  volume = {30},
  number = {0},
  pages = {558 - 566},
  year = {2014},
  note = {},
  issn = {0747-5632},
  doi = {http://dx.doi.org/10.1016/j.chb.2013.06.035},
  url = {http://www.sciencedirect.com/science/article/pii/S0747563213002276},
  author = {Fouzi Harrag},
  keywords = {Text mining},
  keywords = {Information extraction},
  keywords = {Named entity extraction},
  keywords = {Prophetic narrations texts},
  keywords = {Finite state transducer },
  abstract = {Abstract The areas of information retrieval (IR) and information extraction (IE) have been the subject of active research for several years in the Artificial Intelligence and Text Mining community. With the appearance of large textual corpora in recent years, we felt the need to integrate information extraction modules into existing information retrieval systems. The processing of large textual corpora leads to needs situated at the border between the information extraction and information retrieval areas. Our work in this paper focuses on the extraction of surface information, i.e., information that does not require complex linguistic processing to be categorized. The goal is to detect and extract passages or sequences of words containing relevant information from the prophetic narrations texts. We propose a finite state transducer-based system that successively solves the problem of text comprehension. Experimental evaluation results demonstrate that our approach is feasible. Our system achieved encouraging precision and recall rates; the overall precision and recall are 71% and 39%, respectively. }
}
@article{Ding2013315,
  title = {A fast malware detection algorithm based on objective-oriented association mining },
  journal = {Computers & Security },
  volume = {39, Part B},
  number = {0},
  pages = {315 - 324},
  year = {2013},
  note = {},
  issn = {0167-4048},
  doi = {http://dx.doi.org/10.1016/j.cose.2013.08.008},
  url = {http://www.sciencedirect.com/science/article/pii/S0167404813001259},
  author = {Yuxin Ding and Xuebing Yuan and Ke Tang and Xiao Xiao and Yibin Zhang},
  keywords = {Malware detection},
  keywords = {Objective-oriented associate mining},
  keywords = {Security},
  keywords = {Classification},
  keywords = {Machine learning },
  abstract = {Abstract Objective-oriented association (OOA) mining has been successfully applied in malware detection. One problem of \{OOA\} mining is that the number of association rules is very large, and many of the rules are redundant and have little capacity to distinguish malware from benign files. This circumstance seriously affects the running speed of \{OOA\} for malware detection. In this paper, an \{API\} (Application Programming Interface)-based association mining method is proposed for detecting malware. To increase the detection speed of the OOA, different strategies are presented: to improve the rule quality, criteria for \{API\} selection are proposed to remove \{APIs\} that cannot become frequent items; to find association rules that have strong discrimination power, we define the rule utility to evaluate the association rules; and to improve the detection accuracy, a classification method based on multiple association rules is adopted. The experiments show that the proposed strategies can significantly improve the running speed of OOA. In our experiments the time cost for data mining is reduced by thirty-two percent, and the time cost for classification is reduced by fifty percent. }
}
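The notion of rule utility as discrimination power can be hedged down to its simplest form: how much more often a rule's API itemset fires in malware than in benign samples. The counts and the particular measure below are invented for illustration; the API names are real Windows calls often cited in this context.

def utility(rule_hits_malware, n_malware, rule_hits_benign, n_benign):
    """Discrimination power: support in malware minus support in benign."""
    return rule_hits_malware / n_malware - rule_hits_benign / n_benign

rules = {  # itemset -> (hits in malware, #malware, hits in benign, #benign)
    ("CreateRemoteThread", "WriteProcessMemory"): (80, 100, 2, 100),
    ("CreateFileW",): (90, 100, 85, 100),
}
kept = {r: utility(*c) for r, c in rules.items() if utility(*c) > 0.5}
print(kept)   # only the discriminative API pair survives the filter
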
@article{AlHassan2013540,
  title = {A research case study: Difficulties and recommendations when using a textual data mining tool },
  journal = {Information & Management },
  volume = {50},
  number = {7},
  pages = {540 - 552},
  year = {2013},
  note = {},
  issn = {0378-7206},
  doi = {http://dx.doi.org/10.1016/j.im.2013.05.010},
  url = {http://www.sciencedirect.com/science/article/pii/S037872061300061X},
  author = {Abeer A. Al-Hassan and Faleh Alshameri and Edgar H. Sibley},
  keywords = {Corporate websites},
  keywords = {Legal statements},
  keywords = {Policy statements},
  keywords = {Terms of Use},
  keywords = {Textual data mining},
  keywords = {Clustering},
  keywords = {Industry classification},
  keywords = {\{NAICS\}},
  keywords = {\{SIC\}},
  keywords = {Privacy statement },
  abstract = {Abstract Although many interesting results have been reported by researchers using numeric data mining methods, there are still questions that need answering before textual data mining tools will be considered generally useful due to the effort needed to learn and use them. In 2011, we generated a dataset from the legal statements (mainly privacy policy and terms of use) on the websites of 475 of the \{US\} Fortune 500 Companies and used it as input to see what we could detect about the organizational relationships between the companies by using a textual data mining tool. We hoped to find that the tool would cluster similar corporations into the same industrial sector, as validated by the company's self-reported North American Industry Classification System code (NAICS). Unfortunately, this proved only marginally successful, leading us to ask why and to pose our research question: What problems occur when a data-mining tool is used to analyze large textual datasets that are unstructured, complex, duplicative, and contain many homonyms and synonyms? In analyzing our large dataset we learned a great deal about the problem and fortunately, after significant effort, determined how to “massage” the raw dataset to improve the process and learn how the tool can be better used in research situations. We also found that NAICS, as self-reported by companies, are of dubious value to a researcher—a matter briefly discussed. }
}
@article{Tsai20133160,
  title = {Knowledge management vs. data mining: Research trend, forecast and citation approach },
  journal = {Expert Systems with Applications },
  volume = {40},
  number = {8},
  pages = {3160 - 3173},
  year = {2013},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2012.12.029},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417412012705},
  author = {Hsu-Hao Tsai},
  keywords = {Knowledge management},
  keywords = {Data mining},
  keywords = {Research trend analysis},
  keywords = {Bibliometric methodology },
  abstract = {Knowledge management (KM) and data mining (DM) have become more important today; however, there are few comprehensive studies and categorization schemes that discuss the characteristics of both. Using a bibliometric approach, this paper analyzes \{KM\} and \{DM\} research trends, forecasts and citations from 1989 to 2009 by locating the headings “knowledge management” and “data mining” in topics in the \{SSCI\} database. The bibliometric analytical technique was used to examine these two topics in \{SSCI\} journals from 1989 to 2009; we found 1393 articles on \{KM\} and 1181 articles on DM. This paper implemented and classified \{KM\} and \{DM\} articles using the following eight categories (publication year, citation, country/territory, document type, institute name, language, source title and subject area) for different distribution statuses, in order to explore the differences in how \{KM\} and \{DM\} technologies have developed in this period and to analyze \{KM\} and \{DM\} technology tendencies under the above results. Also, the paper performs the K–S test to check whether the distribution of author article production follows Lotka’s law. The research findings can be extended to investigate author productivity by analyzing variables such as chronological and academic age, number and frequency of previous publications, access to research grants, job status, etc. In this way, characteristics of high, medium and low publishing activity of authors can be identified. Besides, these findings will also help to judge scientific research trends and understand the scale of development of research in \{KM\} and \{DM\} by comparing the increase in article authors. Based on the above information, governments and enterprises may infer collective tendencies and demand for scientific researchers in \{KM\} and \{DM\} to formulate appropriate training strategies and policies in the future. This analysis provides a roadmap for future research, abstracts technology trend information and facilitates knowledge accumulation; therefore, future research can be concentrated in core categories. This implies that the phenomenon “success breeds success” is more common in higher quality publications. }
}
@article{Çokpınar20127503,
  title = {Positive and negative association rule mining on \{XML\} data streams in database as a service concept },
  journal = {Expert Systems with Applications },
  volume = {39},
  number = {8},
  pages = {7503 - 7511},
  year = {2012},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2012.01.128},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417412001467},
  author = {Samet Çokpınar and Taflan İmre Gündem},
  keywords = {Association rule mining},
  keywords = {Data streams},
  keywords = {Databases },
  abstract = {In recent years, data mining has become one of the most popular techniques for data owners to determine their strategies. Association rule mining is a data mining approach that is widely used in traditional databases, usually to find positive association rules. However, there are some other challenging rule mining topics, such as data stream mining and negative association rule mining. Besides, organizations want to concentrate on their own business and outsource the rest of their work. This approach is named the “database as a service” concept and provides many benefits to the data owner but, at the same time, brings out some security problems. In this paper, a rule mining system is proposed that provides an efficient and secure solution to positive and negative association rule computation on \{XML\} data streams in the database as a service concept. The system is implemented and several experiments have been done with different synthetic data sets to show the performance and efficiency of the proposed system. }
}
@article{Perner201419,
  title = {Mining Sparse and Big Data by Case-based Reasoning },
  journal = {Procedia Computer Science },
  volume = {35},
  number = {0},
  pages = {19 - 33},
  year = {2014},
  note = {Knowledge-Based and Intelligent Information & Engineering Systems 18th Annual Conference, KES-2014 Gdynia, Poland, September 2014 Proceedings },
  issn = {1877-0509},
  doi = {http://dx.doi.org/10.1016/j.procs.2014.08.081},
  url = {http://www.sciencedirect.com/science/article/pii/S1877050914010461},
  author = {Petra Perner},
  keywords = {Sparse Data Mining},
  keywords = {Big Data Mining},
  keywords = {Case-Based Reasoning},
  keywords = {Similarity Measure},
  keywords = {Data Mining},
  keywords = {Novelty Detection},
  keywords = {Image Processing },
  abstract = {Abstract The increasing use of digital media in daily life has resulted in a need for novel multimedia data analysis techniques. Case-based Reasoning (CBR) solves problems using the already stored knowledge, and captures new knowledge, making it immediately available for solving the next problem. Case-based reasoning can therefore be seen as a method for problem solving, and also as a method to capture new experience and make it immediately available for problem solving; as such, \{CBR\} can mine sparse and big data. It can be seen as a learning and knowledge-discovery approach, since it can capture from new experience some general knowledge, such as case classes, prototypes and some higher-level concepts. In this talk, we will explain the case-based reasoning process scheme. We will show what kinds of methods are necessary to provide all the functions for such a computer model. We will develop the bridge between \{CBR\} and Statistics and show how case-based reasoning can mine big and sparse data. Examples are given based on multimedia applications. Finally, we will show recent new developments and we will give an outline for further work. }
}
@article{Chen2014663,
  title = {Finding keywords in blogs: Efficient keyword extraction in blog mining via user behaviors },
  journal = {Expert Systems with Applications },
  volume = {41},
  number = {2},
  pages = {663 - 670},
  year = {2014},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2013.07.091},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417413005927},
  author = {Yi-Hui Chen and Eric Jui-Lin Lu and Meng Fang Tsai},
  keywords = {Blog mining},
  keywords = {User intention},
  keywords = {Co-keyword},
  keywords = {Blog Connect},
  keywords = {Full-text keyword retrieval procedure },
  abstract = {Abstract Readers are becoming accustomed to obtaining useful and reliable information from bloggers. To make access to the vastly increasing resource of blogs more effective, clustering is useful. Results of the literature review suggest that using linking information, keywords, or tags/categories to calculate similarity is critical for clustering. Keywords are commonly retrieved from the full text, which can be a time-consuming task if multiple articles must be processed. For tags/categories, there is also a problem of ambiguity; that is, different bloggers may define tags/categories of identical content differently. Keywords are important not only to reflect the theme of an article through blog readers’ perspectives but also to accurately match users’ intentions. In this paper, a tracing code is embedded in Blog Connect, a newly developed platform, to collect the keywords queried by readers and then select candidate keywords as co-keywords. The experiments show positive data to confirm that co-keywords can act as a quick path to an article. In addition, co-keyword generation can reduce the complexity and redundancy of full-text keyword retrieval procedures and satisfy blog readers’ intentions. }
}
@article{Thorleuchter20133961,
  title = {Web mining based extraction of problem solution ideas },
  journal = {Expert Systems with Applications },
  volume = {40},
  number = {10},
  pages = {3961 - 3969},
  year = {2013},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2013.01.013},
  url = {http://www.sciencedirect.com/science/article/pii/S095741741300016X},
  author = {D. Thorleuchter and D. Van den Poel},
  keywords = {Web mining},
  keywords = {R&D planning},
  keywords = {Idea mining},
  keywords = {Text mining },
  abstract = {The internet is a valuable source of information where many ideas can be found dealing with different topics. A small number of these ideas might be able to solve an existing problem. However, it is time-consuming to identify these ideas within the large amount of textual information on the internet. This paper introduces a new web mining approach that enables an automated identification of new technological ideas extracted from internet sources that are able to solve a given problem. It adapts and combines several existing approaches from the literature: approaches that extract new technological ideas from a user-given text, approaches that investigate the different idea characteristics in different technical domains, and multi-language web mining approaches. In contrast to previous work, the proposed approach enables the identification of problem solution ideas on the internet considering domain dependencies and language aspects. In a case study, new ideas are identified to solve existing technological problems that occurred in research and development (R&D) projects. This supports the process of research planning and technology development. }
}
@article{Sun2015444,
  title = {Mining affective text to improve social media item recommendation },
  journal = {Information Processing & Management },
  volume = {51},
  number = {4},
  pages = {444 - 457},
  year = {2015},
  note = {},
  issn = {0306-4573},
  doi = {http://dx.doi.org/10.1016/j.ipm.2014.09.002},
  url = {http://www.sciencedirect.com/science/article/pii/S0306457314000879},
  author = {Jianshan Sun and Gang Wang and Xusen Cheng and Yelin Fu},
  keywords = {Social media},
  keywords = {Recommender system},
  keywords = {Sentiment classification},
  keywords = {\{OCCF\} },
  abstract = {Abstract Social media websites, such as YouTube and Flickr, are currently gaining in popularity. A large volume of information is generated by online users and how to appropriately provide personalized content is becoming more challenging. Traditional recommendation models are overly dependent on preference ratings and often suffer from the problem of “data sparsity”. Recent research has attempted to integrate sentiment analysis results of online affective texts into recommendation models; however, these studies are still limited. The one class collaborative filtering (OCCF) method is more applicable in the social media scenario, yet it is insufficient for item recommendation. In this study, we develop a novel sentiment-aware social media recommendation framework, referred to as SA_OCCF, in order to tackle the above challenges. We leverage inferred sentiment feedback information and \{OCCF\} models to improve recommendation performance. We conduct comprehensive experiments on a real social media web site to verify the effectiveness of the proposed framework and methods. The results show that the proposed methods are effective in improving the performance of the baseline \{OCCF\} methods. }
}
@article{Shie201212947,
  title = {Efficient algorithms for mining maximal high utility itemsets from data streams with different models },
  journal = {Expert Systems with Applications },
  volume = {39},
  number = {17},
  pages = {12947 - 12960},
  year = {2012},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2012.05.035},
  url = {http://www.sciencedirect.com/science/article/pii/S095741741200749X},
  author = {Bai-En Shie and Philip S. Yu and Vincent S. Tseng},
  keywords = {High utility itemset},
  keywords = {Maximal pattern},
  keywords = {Utility mining},
  keywords = {Data stream mining },
  abstract = {Data stream mining is an emerging research topic in the data mining field. Finding frequent itemsets is one of the most important tasks in data stream mining with wide applications like online e-business and web click-stream analysis. However, two main problems exist in relevant studies: (1) The utilities (e.g., importance or profits) of items are not considered. Actual utilities of patterns cannot be reflected in frequent itemsets. (2) Existing utility mining methods produce too many patterns and this makes it difficult for the users to filter useful patterns among the huge set of patterns. In view of this, in this paper we propose a novel framework, named \{GUIDE\} (Generation of maximal high Utility Itemsets from Data strEams), to find maximal high utility itemsets from data streams with different models, i.e., landmark, sliding window and time fading models. The proposed structure, named MUI-Tree (Maximal high Utility Itemset Tree), maintains essential information for the mining processes and the proposed strategies further facilitate the performance of GUIDE. Main contributions of this paper are as follows: (1) To the best of our knowledge, this is the first work on mining the compact form of high utility patterns from data streams; (2) \{GUIDE\} is an effective one-pass framework which meets the requirements of data stream mining; (3) \{GUIDE\} generates novel patterns which are not only high utility but also maximal, which provide compact and insightful hidden information in the data streams. Experimental results show that our approach outperforms the state-of-the-art algorithms under various conditions in data stream environments on different models. }
}
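Under the usual utility-mining definitions (utility = quantity times unit profit, summed over window transactions that contain the whole itemset), a sliding-window utility scorer is a few lines; data and profits below are invented.

profit = {"a": 3, "b": 1, "c": 5}   # unit profit per item (invented)
window = [                          # sliding window: item -> quantity
    {"a": 2, "b": 1},
    {"a": 1, "c": 1},
    {"b": 4},
]

def utility(itemset, window):
    total = 0
    for t in window:
        if all(i in t for i in itemset):    # count only full occurrences
            total += sum(t[i] * profit[i] for i in itemset)
    return total

print(utility({"a"}, window))        # 2*3 + 1*3 = 9
print(utility({"a", "b"}, window))   # only the first transaction: 2*3 + 1*1 = 7
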
@article{Velásquez20135228,
  title = {Web mining and privacy concerns: Some important legal issues to be considered before applying any data and information extraction technique in web-based environments },
  journal = {Expert Systems with Applications },
  volume = {40},
  number = {13},
  pages = {5228 - 5239},
  year = {2013},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2013.03.008},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417413001589},
  author = {Juan D. Velásquez},
  keywords = {Web mining},
  keywords = {Privacy},
  keywords = {Regulation},
  keywords = {Personalization },
  abstract = {Abstract Web mining is a concept that gathers all techniques, methods and algorithms used to extract information and knowledge from data originating on the web (web data). A part of this technique aims to analyze the behavior of users in order to continuously improve both the structure and content of visited web sites. Behind this quite altruistic belief – namely, to help the user feel comfortable when they visit a site through a personalization process – there underlie a series of processing methodologies which operate at least arguably from the point of view of the users’ privacy. Thus, an important question arises: to what extent may the desire to improve the services offered through a web site infringe upon the privacy of those who visit it? The use of powerful processing tools such as those provided by web mining may threaten users’ privacy. Current legal scholarship on privacy issues suggests a flexible approach that enables the determination, within each particular context, of those behaviors that can threaten individual privacy. However, it has been observed that \{ICT\} professionals, with the purpose of formulating practical rules on this matter, have a very narrow-minded concept of privacy, primarily centered on the dichotomy between personally identifiable information (PII) and anonymous data. The aim of this paper is to adopt an integrative approach based on the distinctive attributes of web mining in order to determine which techniques and uses are harmful. }
}
@incollection{Liengme2016103,
  title = {Chapter 6 - Data Mining },
  editor = {Liengme, Bernard V. },
  booktitle = {A Guide to Microsoft Excel 2013 for Scientists and Engineers },
  publisher = {Academic Press},
  edition = {},
  address = {Boston},
  year = {2016},
  pages = {103 - 116},
  isbn = {978-0-12-802817-9},
  doi = {http://dx.doi.org/10.1016/B978-0-12-802817-9.00006-4},
  url = {http://www.sciencedirect.com/science/article/pii/B9780128028179000064},
  author = {Bernard V. Liengme},
  keywords = {Frequency distribution},
  keywords = {Pivot tables},
  keywords = {Sorting},
  keywords = {Filtering},
  keywords = {Tables},
  keywords = {Importing files },
  abstract = {Abstract The main thrust of this chapter is the consolidation of large databases into usable information. The chapter begins with instructions on how to import a \{TXT\} file as a way of giving the reader a reasonably large database with which to experiment. The reader is told how to sort and filter a database. Consolidation techniques such as frequency tables and pivot tables are explained. }
}
@incollection{Mack2014439,
  title = {Chapter 35 - Big Data, Data Mining, and Predictive Analytics and High Performance Computing },
  editor = {Jones, Lawrence E. },
  booktitle = {Renewable Energy Integration },
  publisher = {Academic Press},
  edition = {},
  address = {Boston},
  year = {2014},
  pages = {439 - 454},
  isbn = {978-0-12-407910-6},
  doi = {http://dx.doi.org/10.1016/B978-0-12-407910-6.00035-1},
  url = {http://www.sciencedirect.com/science/article/pii/B9780124079106000351},
  author = {Phillippe Mack},
  keywords = {predictive analytics},
  keywords = {smart grid},
  keywords = {big data},
  keywords = {high performance computing},
  keywords = {data mining},
  keywords = {uncertainties},
  keywords = {performance management },
  abstract = {Acceleration in connected devices, computing power and storage capacity has led to an exponential growth of available data. In recent years, all industry sectors have started to face this huge flow of information and tried to turn it into increased value for their business. Renewable energy sources are increasing at a lightning pace all around the world. Uncertainties related to renewable energy market penetration threaten the business of many stakeholders. Advances in predictive analytics have proven to be an efficient solution to cope with uncertainties and transform data into business value. High performance computing and cloud based computation significantly decrease the setup costs of predictive analytics solutions. With the emergence of smart grids, newer solutions based on predictive analytics are starting to appear in the marketplace; however, market players still face numerous roadblocks to leveraging the full potential of big data. }
}
@article{Ríos2014730,
  title = {Generating Groups of Products Using Graph Mining Techniques },
  journal = {Procedia Computer Science },
  volume = {35},
  number = {0},
  pages = {730 - 738},
  year = {2014},
  note = {Knowledge-Based and Intelligent Information & Engineering Systems 18th Annual Conference, KES-2014 Gdynia, Poland, September 2014 Proceedings },
  issn = {1877-0509},
  doi = {http://dx.doi.org/10.1016/j.procs.2014.08.155},
  url = {http://www.sciencedirect.com/science/article/pii/S187705091401120X},
  author = {Sebastián A. Ríos and Ivan F. Videla–Cavieres},
  keywords = {Market Basket Analysis},
  keywords = {Overlap Community Detection},
  keywords = {Big Data},
  keywords = {Graph Mining},
  keywords = {Transactional Data },
  abstract = {The retail industry has evolved. Nowadays, companies around the world need a better and deeper understanding of their customers in order to enhance store layout and generate customer groups, offers and personalized recommendations, among others. To accomplish these objectives, it is very important to know which products are related to each other. Classical approaches for clustering products, such as K-means or SOFM, do not work when the data are scattered and very large. Even association rules give results that are difficult to interpret. These facts motivate us to use a novel approach that generates communities of products. One of the main advantages of these communities is that they are meaningful and easily interpretable by retail analysts. This approach allows the processing of billions of transaction records within a reasonable time, according to the needs of companies. }
}
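
The entry above builds a product graph from transactions and mines communities instead of clustering. A minimal sketch of that idea, assuming Python with networkx available; the baskets are toy data, and greedy modularity maximization stands in for the overlapping community detection used in the paper.

from itertools import combinations
import networkx as nx
from networkx.algorithms.community import greedy_modularity_communities

# Toy market-basket transactions (hypothetical data).
transactions = [
    {"bread", "butter", "milk"},
    {"bread", "butter"},
    {"beer", "chips"},
    {"beer", "chips", "salsa"},
    {"milk", "bread"},
]

# Build the co-purchase graph: edge weight = number of baskets
# in which the two products appear together.
G = nx.Graph()
for basket in transactions:
    for a, b in combinations(sorted(basket), 2):
        w = G.get_edge_data(a, b, {"weight": 0})["weight"]
        G.add_edge(a, b, weight=w + 1)

# Product "communities" via greedy modularity maximization.
for i, community in enumerate(greedy_modularity_communities(G, weight="weight")):
    print("community", i, sorted(community))
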
@incollection{Menzies201541,
  title = {Chapter 7 - Data Mining and \{SE\} },
  editor = {Tim Menzies and Ekrem Kocagüneli and Leandro Minku and Fayola Peters and Burak Turhan},
  booktitle = {Sharing Data and Models in Software Engineering },
  publisher = {Morgan Kaufmann},
  edition = {},
  address = {Boston},
  year = {2015},
  pages = {41 - 42},
  isbn = {978-0-12-417295-1},
  doi = {http://dx.doi.org/10.1016/B978-0-12-417295-1.00007-2},
  url = {http://www.sciencedirect.com/science/article/pii/B9780124172951000072},
  author = {Tim Menzies and Ekrem Kocagüneli and Leandro Minku and Fayola Peters and Burak Turhan},
  abstract = {In this part of the book Data Science for Software Engineering: Sharing Data and Models, we offer some tutorial notes on commonly used software engineering applications of data mining, along with some tutorial material on data mining algorithms. Covered \{SE\} problems include effort estimation and defect prediction. Covered aspects of data mining include discretization, column pruning (also known as feature selection), row pruning, clustering, contrast set learning, decision learning, and learning for continuous classes. }
}
@article{Nettleton20131,
  title = {Data mining of social networks represented as graphs },
  journal = {Computer Science Review },
  volume = {7},
  number = {0},
  pages = {1 - 34},
  year = {2013},
  note = {},
  issn = {1574-0137},
  doi = {http://dx.doi.org/10.1016/j.cosrev.2012.12.001},
  url = {http://www.sciencedirect.com/science/article/pii/S1574013712000445},
  author = {David F. Nettleton},
  keywords = {Graphs},
  keywords = {Online social networks},
  keywords = {Graph mining},
  keywords = {Data mining},
  keywords = {Statistical analysis},
  keywords = {Data modelling },
  abstract = {In this survey we review the literature and concepts of the data mining of social networks, with special emphasis on their representation as a graph structure. The survey is divided into two principal parts: first we conduct a survey of the literature which forms the ‘basis’ and background for the field; second we define a set of ‘hot topics’ which are currently in vogue in congresses and the literature. The ‘basis’ or background part is divided into four major themes: graph theory, social networks, online social networks and graph mining. The graph mining theme is organized into ten subthemes. The second, ‘hot topic’ part, is divided into five major themes: communities, influence and recommendation, models, metrics and dynamics, behaviour and relationships, and information diffusion. }
}
@article{Liu2013686,
  title = {Component analysis of Chinese medicine and advances in fuming-washing therapy for knee osteoarthritis via unsupervised data mining methods },
  journal = {Journal of Traditional Chinese Medicine },
  volume = {33},
  number = {5},
  pages = {686 - 691},
  year = {2013},
  note = {},
  issn = {0254-6272},
  doi = {http://dx.doi.org/10.1016/S0254-6272(14)60043-1},
  url = {http://www.sciencedirect.com/science/article/pii/S0254627214600431},
  author = {Jun Liu and Jianke Pan and Yanping Wang and Dingkun Lin and Dan Shen and Hongjun Yang and Xiang Li and Minghui Luo and Xuewei Cao},
  keywords = {Knee osteoarthritis},
  keywords = {Fuming-washing therapy},
  keywords = {Drug prescriptions},
  keywords = {Data mining },
  abstract = {Objective: To analyze the component law of Chinese medicines in fuming-washing therapy for knee osteoarthritis (KOA), and to develop new fuming-washing prescriptions for \{KOA\} through unsupervised data mining methods. Methods: Chinese medicine recipes for fuming-washing therapy for \{KOA\} were collected and recorded in a database. The correlation coefficient among herbs, core combinations of herbs, and new prescriptions were analyzed using modified mutual information, complex system entropy cluster, and unsupervised hierarchical clustering, respectively. Results: Based on analysis of 345 Chinese medicine recipes for fuming-washing therapy, 68 herbs occurred frequently, 33 herb pairs occurred frequently, and 12 core combinations were found. Five new fuming-washing recipes for \{KOA\} were developed. Conclusion: Chinese medicines for fuming-washing therapy of \{KOA\} mainly consist of wind-dampness-dispelling and cold-dispersing herbs, blood-activating and stasis-resolving herbs, and wind-dampness-dispelling and heat-clearing herbs. The treatment of fuming-washing therapy for \{KOA\} also includes dispelling wind-dampness and dispersing cold, activating blood and resolving stasis, and dispelling wind-dampness and clearing heat. Zhenzhutougucao (Herba Speranskiae Tuberculatae), Honghua (Flos Carthami), Niuxi (Radix Achyranthis Bidentatae), Shenjincao (Herba Lycopodii Japonici), Weilingxian (Radix et Rhizoma Clematidis Chinensis), Chuanwu (Radix Aconiti), Haitongpi (Cortex Erythrinae Variegatae), Ruxiang (Olibanum), Danggui (Radix Angelicae Sinensis), Caowu (Radix Aconiti Kusnezoffii), Moyao (Myrrha), and Aiye (Folium Artemisiae Argyi) are the main herbs used in the fuming-washing treatment for KOA. }
}
@article{Tsai20128172,
  title = {Global data mining: An empirical study of current trends, future forecasts and technology diffusions },
  journal = {Expert Systems with Applications },
  volume = {39},
  number = {9},
  pages = {8172 - 8181},
  year = {2012},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2012.01.150},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417412001704},
  author = {Hsu-Hao Tsai},
  keywords = {Data mining},
  keywords = {Research trends and forecasts},
  keywords = {Technology diffusions},
  keywords = {Bibliometric methodology },
  abstract = {Using a bibliometric approach, this paper analyzes research trends and forecasts of data mining from 1989 to 2009 by locating the heading “data mining” in the topic field of the \{SSCI\} database. The bibliometric analytical technique was used to examine the topic in \{SSCI\} journals from 1989 to 2009; we found 1181 articles dealing with data mining. This paper implemented and classified data mining articles using the following eight categories—publication year, citation, country/territory, document type, institute name, language, source title and subject area—for different distribution statuses in order to explore the differences and how data mining technologies have developed in this period, and to analyze technology tendencies and forecasts of data mining from the above results. The paper also performs the K-S test to check whether the distribution follows Lotka’s law, and reviews the historical literature to trace the technology diffusion of data mining. The paper provides a roadmap for future research, abstracts technology trends and forecasts, and facilitates knowledge accumulation so that data mining researchers can save time, since core knowledge is concentrated in core categories. This implies that the phenomenon “success breeds success” is more common in higher quality publications. }
}
@article{Thorleuchter20134978,
  title = {Weak signal identification with semantic web mining },
  journal = {Expert Systems with Applications },
  volume = {40},
  number = {12},
  pages = {4978 - 4985},
  year = {2013},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2013.03.002},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417413001528},
  author = {Dirk Thorleuchter and Dirk Van den Poel},
  keywords = {Weak Signal},
  keywords = {Ansoff},
  keywords = {Latent semantic indexing},
  keywords = {\{SVD\}},
  keywords = {Web mining },
  abstract = {We investigate an automated identification of weak signals according to Ansoff to improve strategic planning and technological forecasting. The literature shows that weak signals can be found in the organization’s environment and that they appear in different contexts. We use internet information to represent the organization’s environment and we select those websites that are related to a given hypothesis. In contrast to related research, a methodology is provided that uses latent semantic indexing (LSI) for the identification of weak signals. This improves existing knowledge-based approaches because \{LSI\} considers aspects of meaning and is thus able to identify similar textual patterns in different contexts. A new weak signal maximization approach is introduced that replaces the commonly used prediction modeling approach in LSI. It enables the calculation of the largest number of relevant weak signals represented by singular value decomposition (SVD) dimensions. A case study identifies and analyses weak signals to predict trends in the field of on-site medical oxygen production. This supports the planning of research and development (R&D) for a medical oxygen supplier. As a result, it is shown that the proposed methodology enables organizations to identify weak signals from the internet for a given hypothesis. This helps strategic planners to react ahead of time. }
}
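
The entry above identifies weak signals with latent semantic indexing. A minimal LSI sketch, assuming Python with scikit-learn installed; the documents are toy data, and the paper's weak-signal maximization over SVD dimensions is not reproduced.

# LSI sketch: TF-IDF term-document matrix reduced by truncated SVD,
# so that texts sharing meaning (not just words) land close together.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

docs = [
    "on-site oxygen generation for hospitals",
    "medical oxygen supply chain and logistics",
    "membrane technology for gas separation",
    "football results of the weekend league",
]

tfidf = TfidfVectorizer(stop_words="english").fit_transform(docs)
lsi = TruncatedSVD(n_components=2, random_state=0).fit_transform(tfidf)

# Documents 0-2 should be far more similar to each other than to doc 3.
print(cosine_similarity(lsi))
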
@incollection{Menzies201551,
  title = {Chapter 10 - Data Mining (Under The Hood) },
  editor = {Tim Menzies and Ekrem Kocagüneli and Leandro Minku and Fayola Peters and Burak Turhan},
  booktitle = {Sharing Data and Models in Software Engineering },
  publisher = {Morgan Kaufmann},
  edition = {},
  address = {Boston},
  year = {2015},
  pages = {51 - 75},
  isbn = {978-0-12-417295-1},
  doi = {http://dx.doi.org/10.1016/B978-0-12-417295-1.00010-2},
  url = {http://www.sciencedirect.com/science/article/pii/B9780124172951000102},
  author = {Tim Menzies and Ekrem Kocagüneli and Leandro Minku and Fayola Peters and Burak Turhan},
  abstract = {In this part of the book Data Science for Software Engineering: Sharing Data and Models, we offer some tutorial notes on commonly used software engineering applications of data mining, along with some tutorial material on data mining algorithms. Covered \{SE\} problems include effort estimation and defect prediction. Covered aspects of data mining include discretization, column pruning (also known as feature selection), row pruning, clustering, contrast set learning, decision learning, and learning for continuous classes. }
}
@article{Mostafa20134241,
  title = {More than words: Social networks’ text mining for consumer brand sentiments },
  journal = {Expert Systems with Applications },
  volume = {40},
  number = {10},
  pages = {4241 - 4251},
  year = {2013},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2013.01.019},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417413000328},
  author = {Mohamed M. Mostafa},
  keywords = {Consumer behavior},
  keywords = {Global brands},
  keywords = {Sentiment analysis},
  keywords = {Text mining},
  keywords = {Twitter },
  abstract = {Blogs and social networks have recently become a valuable resource for mining sentiments in fields as diverse as customer relationship management, public opinion tracking and text filtering. In fact, knowledge obtained from social networks such as Twitter and Facebook has been shown to be extremely valuable to marketing research companies, public opinion organizations and other text mining entities. However, Web texts have been classified as noisy because they pose considerable problems at both the lexical and the syntactic levels. In this research we used a random sample of 3516 tweets to evaluate consumers’ sentiment towards well-known brands such as Nokia, T-Mobile, IBM, \{KLM\} and DHL. We used an expert-predefined lexicon including around 6800 seed adjectives with known orientation to conduct the analysis. Our results indicate a generally positive consumer sentiment towards several famous brands. By using both a qualitative and quantitative methodology to analyze brands’ tweets, this study adds breadth and depth to the debate over attitudes towards cosmopolitan brands. }
}
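
The entry above scores tweets against a predefined lexicon of seed adjectives. A minimal sketch of such lexicon-based scoring in plain Python; the tiny word lists and tweets are hypothetical stand-ins for the roughly 6800-adjective lexicon used in the study.

# Score a tweet by counting matches against seed adjectives with
# known orientation; positive minus negative gives the polarity.
POSITIVE = {"good", "great", "reliable", "fast"}
NEGATIVE = {"bad", "slow", "awful", "broken"}

def sentiment(tweet):
    words = tweet.lower().split()
    return sum(w in POSITIVE for w in words) - sum(w in NEGATIVE for w in words)

tweets = ["great phone and fast delivery", "awful support, slow and broken"]
for t in tweets:
    print(sentiment(t), t)
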
@article{Zhu2013200,
  title = {Biomedical text mining and its applications in cancer research },
  journal = {Journal of Biomedical Informatics },
  volume = {46},
  number = {2},
  pages = {200 - 211},
  year = {2013},
  note = {},
  issn = {1532-0464},
  doi = {http://dx.doi.org/10.1016/j.jbi.2012.10.007},
  url = {http://www.sciencedirect.com/science/article/pii/S1532046412001712},
  author = {Fei Zhu and Preecha Patumcharoenpol and Cheng Zhang and Yang Yang and Jonathan Chan and Asawin Meechai and Wanwipa Vongsangnak and Bairong Shen},
  keywords = {Biomedical text},
  keywords = {Cancer},
  keywords = {Systems biology},
  keywords = {Text mining },
  abstract = {Cancer is a malignant disease that has caused millions of human deaths. Its study has a long history of well over 100 years. There have been an enormous number of publications on cancer research. This integrated but unstructured biomedical text is of great value for cancer diagnostics, treatment, and prevention. The immense body and rapid growth of biomedical text on cancer has led to the appearance of a large number of text mining techniques aimed at extracting novel knowledge from scientific text. Biomedical text mining on cancer research is computationally automatic and high-throughput in nature. However, it is error-prone due to the complexity of natural language processing. In this review, we introduce the basic concepts underlying text mining and examine some frequently used algorithms, tools, and data sets, as well as assessing how much these algorithms have been utilized. We then discuss the current state-of-the-art text mining applications in cancer research and we also provide some resources for cancer text mining. With the development of systems biology, researchers tend to understand complex biomedical systems from a systems biology viewpoint. Thus, the full utilization of text mining to facilitate cancer systems biology research is fast becoming a major concern. To address this issue, we describe the general workflow of text mining in cancer systems biology and each phase of the workflow. We hope that this review can (i) provide a useful overview of the current work of this field; (ii) help researchers to choose text mining tools and datasets; and (iii) highlight how to apply text mining to assist cancer systems biology research. }
}
@article{Liao20131542,
  title = {Data mining investigation of co-movements on the Taiwan and China stock markets for future investment portfolio },
  journal = {Expert Systems with Applications },
  volume = {40},
  number = {5},
  pages = {1542 - 1554},
  year = {2013},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2012.08.075},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417412010512},
  author = {Shu-Hsien Liao and Shan-Yuan Chou},
  keywords = {Cross-national stock market},
  keywords = {Stock market investment portfolio},
  keywords = {Co-movements},
  keywords = {Data mining},
  keywords = {Association rules},
  keywords = {Cluster analysis },
  abstract = {On June 29, 2010, Taiwan signed an Economic Cooperation Framework Agreement (ECFA) with China as a major step to open markets between Taiwan and China. Thus, the \{ECFA\} will contribute to creating a closer relationship between China and Taiwan through economic and market interactions. Co-movements of the world’s national financial market indexes are a popular research topic in the finance literature. Some studies examine the co-movements and the benefits of international financial market portfolio diversification/integration and economic performance. Thus, this study investigates the co-movement of the Taiwan and China (Hong Kong) stock markets under the \{ECFA\} using a data mining approach, including association rules and clustering analysis. Thirty categories of stock indexes are implemented as decision variables to observe the behavior of stock index associations during the periods of \{ECFA\} implementation. Patterns, rules, and clusters of data mining results are discussed for future stock market investment portfolios. }
}
@article{Soares20131451,
  title = {Discovering collaborative knowledge-intensive processes through e-mail mining },
  journal = {Journal of Network and Computer Applications },
  volume = {36},
  number = {6},
  pages = {1451 - 1465},
  year = {2013},
  note = {},
  issn = {1084-8045},
  doi = {http://dx.doi.org/10.1016/j.jnca.2013.02.007},
  url = {http://www.sciencedirect.com/science/article/pii/S1084804513000453},
  author = {Diego Carvalho Soares and Flávia Maria Santoro and Fernanda Araujo Baião},
  keywords = {Knowledge-intensive process},
  keywords = {Process mining},
  keywords = {Natural language processing},
  keywords = {Collaborative environment },
  abstract = {Knowledge Management aims to promote the growth, communication and preservation of knowledge within an organization, which includes managing the appropriate resources to facilitate knowledge sharing and reuse. Business Process-Oriented Knowledge Management focuses on discovering and representing the dynamic conversion of existing knowledge among participants involved in executing business processes. In this context, Knowledge-Intensive Processes are a very important and challenging specific subclass of processes, since they strongly involve socialization and informal exchanges of knowledge among participants. This paper describes in detail a method for semi-automatic discovery of relevant information characterizing Knowledge-Intensive Processes, as well as the results of further evaluation of the method. Our approach draws on the informal exchange of existing knowledge in collaborative tools such as e-mails. The output is a conceptual map that describes the main elements of a Knowledge-Intensive Process, as well as their relationships. The results from the case study conducted to evaluate the method in an organization underlined the prospects for using collaborative environments to discover the way agents perform their activities. }
}
@article{Duan2013425,
  title = {Mining effective multi-segment sliding window for pathogen incidence rate prediction },
  journal = {Data & Knowledge Engineering },
  volume = {87},
  number = {0},
  pages = {425 - 444},
  year = {2013},
  note = {},
  issn = {0169-023X},
  doi = {http://dx.doi.org/10.1016/j.datak.2013.05.006},
  url = {http://www.sciencedirect.com/science/article/pii/S0169023X13000517},
  author = {Lei Duan and Changjie Tang and Xiaosong Li and Guozhu Dong and Xianming Wang and Jie Zuo and Min Jiang and Zhongqi Li and Yongqing Zhang},
  keywords = {Data mining},
  keywords = {Time series modeling},
  keywords = {Multi-segment sliding window},
  keywords = {Pathogen incidence rate prediction },
  abstract = {Pathogen incidence rate prediction, which can be considered as time series modeling, is an important task for infectious disease incidence rate prediction and for public health. This paper investigates the application of a genetic computation technique, namely GEP, for pathogen incidence rate prediction. To overcome the shortcomings of traditional sliding windows in GEP-based time series modeling, the paper introduces the problem of mining effective sliding windows, for discovering optimal sliding windows for building accurate prediction models. To utilize the periodical characteristic of pathogen incidence rates, a multi-segment sliding window consisting of several segments from different periodical intervals is proposed and used. Since the number of such candidate windows is still very large, a heuristic method is designed for enumerating the candidate effective multi-segment sliding windows. Moreover, methods to find the optimal sliding window and then produce a mathematical model based on that window are proposed. A performance study on real-world datasets shows that the techniques are effective and efficient for pathogen incidence rate prediction. }
}
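
The entry above predicts a periodic series from a multi-segment sliding window. A minimal sketch in plain Python of how such a window assembles training rows from a recent segment plus a segment one period earlier; in the paper the window itself is mined by a heuristic GEP-based search, whereas here it is fixed by hand and the series is synthetic.

# Each training row joins a segment of recent values with a segment
# taken one full period earlier (same phase of the cycle).
def multi_segment_rows(series, recent=3, seasonal=2, period=12):
    rows = []
    for t in range(period + seasonal, len(series)):
        recent_seg = series[t - recent:t]
        seasonal_seg = series[t - period - seasonal:t - period]
        rows.append((recent_seg + seasonal_seg, series[t]))  # (features, target)
    return rows

series = [(i % 12) + i * 0.1 for i in range(48)]  # toy periodic data
print(multi_segment_rows(series)[0])
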
@article{Lin20127173,
  title = {An incremental mining algorithm for high utility itemsets },
  journal = {Expert Systems with Applications },
  volume = {39},
  number = {8},
  pages = {7173 - 7180},
  year = {2012},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2012.01.072},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417412000851},
  author = {Chun-Wei Lin and Guo-Cheng Lan and Tzung-Pei Hong},
  keywords = {Utility mining},
  keywords = {High utility itemset},
  keywords = {Incremental mining},
  keywords = {\{FUP\} concept},
  keywords = {Two-phase algorithm },
  abstract = {Association-rule mining, which is based on frequency values of items, is the most common topic in data mining. In real-world applications, customers may, however, buy many copies of products, and each product may have different factors, such as profits and prices. Mining only frequent itemsets in binary databases is thus not suitable for some applications. Utility mining is thus presented to consider additional measures, such as profits or costs according to user preference. In the past, a two-phase mining algorithm was designed for fast discovery of high utility itemsets from databases. When data come intermittently, the approach needs to process all the transactions in a batch way. In this paper, an incremental mining algorithm for efficiently mining high utility itemsets is proposed to handle the above situation. It is based on the concept of the fast-update (FUP) approach, which was originally designed for association mining. The proposed approach first partitions itemsets into four parts according to whether they are high transaction-weighted utilization itemsets in the original database and in the newly inserted transactions. Each part is then processed by its own procedure. Experimental results also show that the proposed algorithm executes faster than the two-phase batch mining algorithm in the intermittent data environment. }
}
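
The entry above builds on the two-phase idea of pruning by transaction-weighted utility (TWU) before computing exact utilities. A minimal sketch of that two-phase pruning in plain Python; the FUP-style incremental partitioning of the paper is not shown, and the database, unit profits and threshold are toy values.

from itertools import combinations

profit = {"A": 5, "B": 2, "C": 1}                  # unit profit per item
db = [{"A": 1, "B": 2}, {"B": 4, "C": 3}, {"A": 2, "C": 1}]  # item -> quantity
minutil = 10

# Transaction utility = total profit of each transaction.
tu = [sum(profit[i] * q for i, q in t.items()) for t in db]

def twu(itemset):
    # TWU: sum of utilities of the transactions containing the itemset.
    return sum(u for t, u in zip(db, tu) if itemset <= set(t))

def utility(itemset):
    # Exact utility of the itemset across the database.
    return sum(sum(profit[i] * t[i] for i in itemset)
               for t in db if itemset <= set(t))

candidates = [frozenset(c) for n in (1, 2) for c in combinations(sorted(profit), n)]
phase1 = [c for c in candidates if twu(c) >= minutil]   # TWU is a safe overestimate
phase2 = {tuple(sorted(c)): utility(c) for c in phase1 if utility(c) >= minutil}
print(phase2)  # exact high utility itemsets
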
@article{Sael201582,
  title = {Scalable Tensor Mining },
  journal = {Big Data Research },
  volume = {2},
  number = {2},
  pages = {82 - 86},
  year = {2015},
  note = {Visions on Big Data },
  issn = {2214-5796},
  doi = {http://dx.doi.org/10.1016/j.bdr.2015.01.004},
  url = {http://www.sciencedirect.com/science/article/pii/S2214579615000052},
  author = {Lee Sael and Inah Jeon and U Kang},
  keywords = {Tensor},
  keywords = {Distributed computing},
  keywords = {Big data },
  abstract = {Tensors, or multi-dimensional arrays, are receiving significant attention due to the various types of data that can be modeled by them; examples include call graphs (sender, receiver, time), knowledge bases (subject, verb, object), and 3-dimensional web graphs augmented with anchor texts, to name a few. Scalable tensor mining aims to extract important patterns and anomalies from a large amount of tensor data. In this paper, we provide an overview of scalable tensor mining. We first present the main algorithms for tensor mining and their scalable versions. Next, we describe success stories of using tensors for interesting data mining problems including higher order web analysis, knowledge base mining, network traffic analysis, citation analysis, and sensor data analysis. Finally, we discuss interesting future research directions for scalable tensor mining. }
}
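
The entry above surveys scalable tensor decompositions. A minimal sketch, assuming Python with NumPy, of one building block such methods rest on: mode-1 matricization of a 3-way tensor followed by a truncated SVD; the random tensor is a placeholder for real (sender, receiver, time) data, and this is a simplified stand-in for the distributed CP/Tucker algorithms the paper covers.

import numpy as np

rng = np.random.default_rng(0)
X = rng.random((4, 5, 6))            # toy 3-way tensor

X1 = X.reshape(4, -1)                # mode-1 unfolding: a 4 x 30 matrix
U, s, Vt = np.linalg.svd(X1, full_matrices=False)
rank = 2                             # keep the dominant patterns only
print("top singular values:", s[:rank])
print("mode-1 factor shape:", U[:, :rank].shape)
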
@article{Lo2013773,
  title = {Mining direct antagonistic communities in signed social networks },
  journal = {Information Processing & Management },
  volume = {49},
  number = {4},
  pages = {773 - 791},
  year = {2013},
  note = {},
  issn = {0306-4573},
  doi = {http://dx.doi.org/10.1016/j.ipm.2012.12.009},
  url = {http://www.sciencedirect.com/science/article/pii/S0306457313000058},
  author = {David Lo and Didi Surian and Philips Kokoh Prasetyo and Kuan Zhang and Ee-Peng Lim},
  keywords = {Direct antagonistic community},
  keywords = {Mining maximal bi-cliques},
  keywords = {Signed social network },
  abstract = {Social networks provide a wealth of data to study relationship dynamics among people. Most social networks such as Epinions and Facebook allow users to declare trusts or friendships with other users. Some of them also allow users to declare distrusts or negative relationships. When both positive and negative links co-exist in a network, some interesting community structures can be studied. In this work, we mine Direct Antagonistic Communities (DACs) within such signed networks. Each \{DAC\} consists of two sub-communities with positive relationships among members of each sub-community, and negative relationships among members of the other sub-community. Identifying direct antagonistic communities is an important step to understand the nature of the formation, dissolution, and evolution of such communities. Knowledge about antagonistic communities allows us to better understand and explain behaviors of users in the communities. Identifying \{DACs\} from a large signed network is however challenging, as a very large number of combinations of user sets needs to be checked. We propose an efficient data mining solution that leverages the properties of DACs, and combines the identification of strongest connected components and bi-clique mining. We have evaluated our approach on synthetic, myGamma, and Epinions datasets to showcase the efficiency and utility of our proposed approach. We show that we can mine \{DACs\} in less than 15 min from a signed network of myGamma, which is a mobile social networking site, consisting of 600,000 members and 8 million links. An investigation of the behavior of users participating in \{DACs\} shows that antagonism significantly affects the way people behave and interact with one another. }
}
@article{MottaCabrera2013210,
  title = {Data association mining for identifying lighting energy waste patterns in educational institutes },
  journal = {Energy and Buildings },
  volume = {62},
  number = {0},
  pages = {210 - 216},
  year = {2013},
  note = {},
  issn = {0378-7788},
  doi = {http://dx.doi.org/10.1016/j.enbuild.2013.02.049},
  url = {http://www.sciencedirect.com/science/article/pii/S0378778813001436},
  author = {David F. Motta Cabrera and Hamidreza Zareipour},
  keywords = {Building energy use},
  keywords = {Energy efficiency},
  keywords = {Lighting},
  keywords = {Data association mining },
  abstract = {A significant portion of the energy consumption in post-secondary educational institutes is for lighting classrooms. The occupancy patterns in post-secondary educational institutes are not stable and predictable, and thus, alternative solutions may be required to match energy consumption and occupancy in order to increase energy efficiency. In this paper, we report an experimental study on quantifying and understanding lighting energy waste patterns in a post-secondary educational institute. Data has been collected over a full academic year in three typical classrooms. Data association mining, a powerful data mining tool, is applied to the data in order to extract association rules and explore lighting waste patterns. The simulation results show that if the waste patterns are avoided, significant savings, as high as 70% of the current energy use, are achievable. }
}
@article{Lee2013493,
  title = {Results on mining \{NHANES\} data: A case study in evidence-based medicine },
  journal = {Computers in Biology and Medicine },
  volume = {43},
  number = {5},
  pages = {493 - 503},
  year = {2013},
  note = {},
  issn = {0010-4825},
  doi = {http://dx.doi.org/10.1016/j.compbiomed.2013.02.018},
  url = {http://www.sciencedirect.com/science/article/pii/S0010482513000693},
  author = {Jun won Lee and Christophe Giraud-Carrier},
  keywords = {Medical data mining},
  keywords = {Observational study},
  keywords = {Evidence-based medicine},
  keywords = {\{NHANES\} },
  abstract = {The National Health and Nutrition Examination Survey (NHANES), administered annually by the National Center for Health Statistics, is designed to assess the general health and nutritional status of adults and children in the United States. Given to several thousands of individuals, the extent of this survey is very broad, covering demographic, laboratory and examination information, as well as responses to a fairly comprehensive health questionnaire. In this paper, we adapt and extend association rule mining and clustering algorithms to extract useful knowledge regarding diabetes and high blood pressure from the 1999–2008 survey results, thus demonstrating how data mining techniques may be used to support evidence-based medicine. }
}
@article{Yoon201212543,
  title = {Detecting weak signals for long-term business opportunities using text mining of Web news },
  journal = {Expert Systems with Applications },
  volume = {39},
  number = {16},
  pages = {12543 - 12550},
  year = {2012},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2012.04.059},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417412006562},
  author = {Janghyeok Yoon},
  keywords = {Weak signal},
  keywords = {Future sign},
  keywords = {Text mining},
  keywords = {Web news},
  keywords = {Peripheral vision},
  keywords = {Business intelligence },
  abstract = {In an uncertain business environment, competitive intelligence requires peripheral vision to scan and identify weak signals that can affect the future business environment. Weak signals are defined as imprecise and early indicators of impending important events or trends, which are considered key to formulating new potential business items. However, existing methods for discovering weak signals rely on the knowledge and expertise of experts, whose services are not widely available and tend to be costly. They may even provide different analysis results. Therefore, this paper presents a quantitative method that identifies weak signal topics by exploiting keyword-based text mining. The proposed method is illustrated using Web news articles related to solar cells. As a supportive tool for the expert-based approach, this method can be incorporated into long-term business planning processes to assist experts in identifying potential business items. }
}
@article{Caron2013464,
  title = {A comprehensive investigation of the applicability of process mining techniques for enterprise risk management },
  journal = {Computers in Industry },
  volume = {64},
  number = {4},
  pages = {464 - 475},
  year = {2013},
  note = {},
  issn = {0166-3615},
  doi = {http://dx.doi.org/10.1016/j.compind.2013.02.001},
  url = {http://www.sciencedirect.com/science/article/pii/S016636151300016X},
  author = {Filip Caron and Jan Vanthienen and Bart Baesens},
  keywords = {Enterprise risk management},
  keywords = {Process mining},
  keywords = {Business process analytics},
  keywords = {Business rules},
  keywords = {Process-aware information systems },
  abstract = {Process mining techniques and tools perfectly complement the existing set of enterprise risk management approaches. Enterprise risk management aims at minimizing the negative effects of uncertainty on the objectives, while at the same time promoting the potential positive effects. Process mining research has proposed a broad range of techniques and tools that could be used to effectively support the activities related to the different phases of risk management. This paper contributes to the process mining and risk management research by providing a full exploration of the applicability of process mining in the context of the eight components of the \{COSO\} Enterprise Risk Management Framework. The identified applications will be illustrated based on the risks involved in insurance claim handling processes. }
}
@article{He2013464,
  title = {Social media competitive analysis and text mining: A case study in the pizza industry },
  journal = {International Journal of Information Management },
  volume = {33},
  number = {3},
  pages = {464 - 472},
  year = {2013},
  note = {},
  issn = {0268-4012},
  doi = {http://dx.doi.org/10.1016/j.ijinfomgt.2013.01.001},
  url = {http://www.sciencedirect.com/science/article/pii/S0268401213000030},
  author = {Wu He and Shenghua Zha and Ling Li},
  keywords = {Social media},
  keywords = {Facebook},
  keywords = {Twitter},
  keywords = {Case study},
  keywords = {Pizza industry},
  keywords = {Competitive analysis},
  keywords = {Competitive intelligence},
  keywords = {Competitor intelligence},
  keywords = {Actionable intelligence},
  keywords = {Text mining},
  keywords = {Content analysis },
  abstract = {Social media have been adopted by many businesses. More and more companies are using social media tools such as Facebook and Twitter to provide various services and interact with customers. As a result, a large amount of user-generated content is freely available on social media sites. To increase competitive advantage and effectively assess the competitive environment of businesses, companies need to monitor and analyze not only the customer-generated content on their own social media sites, but also the textual information on their competitors’ social media sites. In an effort to help companies understand how to perform a social media competitive analysis and transform social media data into knowledge for decision makers and e-marketers, this paper describes an in-depth case study which applies text mining to analyze unstructured text content on Facebook and Twitter sites of the three largest pizza chains: Pizza Hut, Domino's Pizza and Papa John's Pizza. The results reveal the value of social media competitive analysis and the power of text mining as an effective technique to extract business value from the vast amount of available social media data. Recommendations are also provided to help companies develop their social media competitive analysis strategy. }
}
@article{Ferrari2013516,
  title = {Classification and prediction of whereabouts patterns from the Reality Mining dataset },
  journal = {Pervasive and Mobile Computing },
  volume = {9},
  number = {4},
  pages = {516 - 527},
  year = {2013},
  note = {},
  issn = {1574-1192},
  doi = {http://dx.doi.org/10.1016/j.pmcj.2012.04.002},
  url = {http://www.sciencedirect.com/science/article/pii/S1574119212000491},
  author = {Laura Ferrari and Marco Mamei},
  keywords = {Whereabouts data},
  keywords = {Human mobility},
  keywords = {\{LDA\} topic extraction},
  keywords = {Topic classification},
  keywords = {Topic prediction },
  abstract = {Classification and prediction of users’ whereabouts patterns is important for many emerging ubiquitous computing applications. Latent Dirichlet Allocation (LDA) is a powerful mechanism to extract recurrent behaviors and high-level patterns (called topics) from mobility data in an unsupervised manner. One drawback of \{LDA\} is that it is difficult to give meaningful and usable labels to the extracted topics. We present a methodology to automatically classify the topics with meaningful labels so as to support their use in applications. We also present a topic prediction mechanism to infer a user’s future whereabouts on the basis of the extracted topics. Both mechanisms are tested and evaluated using the Reality Mining dataset, consisting of a large set of continuous data on human behavior. }
}
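
The entry above extracts whereabouts topics with LDA. A minimal sketch, assuming Python with scikit-learn, that treats each day as a "document" of discretized (hour, place) tokens; the days are toy data, and the paper's labeling and prediction mechanisms are not reproduced.

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# One "document" per day; tokens encode hour-of-day and place.
days = [
    "h08_home h09_commute h10_office h18_commute h19_home",
    "h08_home h10_office h12_office h18_commute h20_home",
    "h10_park h12_cafe h15_park h18_home",
]

counts = CountVectorizer().fit_transform(days)
lda = LatentDirichletAllocation(n_components=2, random_state=0).fit(counts)
print(lda.transform(counts))  # per-day topic mixtures (recurring behaviors)
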
@article{Huang20132998,
  title = {Decentralized mining social network communities with agents },
  journal = {Mathematical and Computer Modelling },
  volume = {57},
  number = {11–12},
  pages = {2998 - 3008},
  year = {2013},
  note = {Information System Security and Performance Modeling and Simulation for Future Mobile Networks },
  issn = {0895-7177},
  doi = {http://dx.doi.org/10.1016/j.mcm.2013.03.005},
  url = {http://www.sciencedirect.com/science/article/pii/S0895717713000794},
  author = {Jing Huang and Bo Yang and Di Jin and Yi Yang},
  keywords = {Social network},
  keywords = {Community mining},
  keywords = {Multi-agent system},
  keywords = {Decentralized algorithm },
  abstract = {Network community mining algorithms aim at efficiently and effectively discovering the communities hidden in a given network. Many related methods have been proposed and applied to different areas including social network analysis, gene network analysis and web clustering engines. Most of the existing methods for mining communities are centralized. In this paper, we present a multi-agent based decentralized algorithm, in which a group of autonomous agents work together to mine a network through a proposed self-aggregation and self-organization mechanism. Thanks to its decentralized nature, our method is potentially suitable for dealing with distributed networks, whose global structures are hard to obtain due to their geographical distributions, decentralized controls or huge sizes. The effectiveness of our method has been tested against different benchmark networks. }
}
@article{Chen20132746,
  title = {Constructing concept maps for adaptive learning systems based on data mining techniques },
  journal = {Expert Systems with Applications },
  volume = {40},
  number = {7},
  pages = {2746 - 2755},
  year = {2013},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2012.11.018},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417412012286},
  author = {Shyi-Ming Chen and Po-Jui Sue},
  keywords = {Adaptive learning systems},
  keywords = {Association rules},
  keywords = {Concept maps},
  keywords = {Data mining},
  keywords = {Questions-relationship map },
  abstract = {In this paper, we propose a new method for automatically constructing concept maps for adaptive learning systems based on data mining techniques. First, we calculate the counter values between any two questions, where the counter values indicate the answer-consistency between any two questions. Then, we consider four kinds of association rules between two questions to mine some information. Finally, we calculate the relevance degree between two concepts derived from the association rules to construct concept maps for adaptive learning systems. The proposed method can overcome the drawbacks of Chen and Bai’s (2010) and Lee et al.’s (2009) methods. It provides us with a useful way to construct concept maps for adaptive learning systems based on data mining techniques. }
}
@article{Shelokar2013118,
  title = {A multiobjective evolutionary programming framework for graph-based data mining },
  journal = {Information Sciences },
  volume = {237},
  number = {0},
  pages = {118 - 136},
  year = {2013},
  note = {Prediction, Control and Diagnosis using Advanced Neural Computations },
  issn = {0020-0255},
  doi = {http://dx.doi.org/10.1016/j.ins.2013.02.014},
  url = {http://www.sciencedirect.com/science/article/pii/S0020025513001230},
  author = {Prakash Shelokar and Arnaud Quirin and Óscar Cordón},
  keywords = {Graph-based data mining},
  keywords = {Frequent subgraph mining},
  keywords = {Evolutionary multiobjective optimization},
  keywords = {Pareto optimality},
  keywords = {Multiobjective graph-based data mining},
  keywords = {Multiobjective evolutionary programming },
  abstract = {Subgraph mining is the process of identifying concepts describing interesting and repetitive subgraphs within graph-based data. The exponential number of possible subgraphs makes the problem very challenging. Existing methods apply a single-objective subgraph search with the view that interesting subgraphs are those capable of not merely compressing the data, but also enhancing the interpretation of the data considerably. Usually the methods operate by posing simple constraints (or user-defined thresholds) such as returning all subgraphs whose frequency is above a specified threshold. Such a search approach may often return either a large number of solutions in the case of a weakly defined objective or very few in the case of a very strictly defined objective. In this paper, we propose a framework based on multiobjective evolutionary programming to mine subgraphs by jointly maximizing two objectives, support and size of the extracted subgraphs. The proposed methodology is able to discover a nondominated set of interesting subgraphs subject to tradeoff between the two objectives, which otherwise would not be achieved by the single-objective search. Besides, it can use different specific multiobjective evolutionary programming methods. Experimental results obtained by three of the latter methods on synthetically generated as well as real-life graph-based datasets validate the utility of the proposed methodology when benchmarked against classical single-objective methods and their previous, nonevolutionary multiobjective extensions. }
}
@article{Ikeda201335,
  title = {Twitter user profiling based on text and community mining for market analysis },
  journal = {Knowledge-Based Systems },
  volume = {51},
  number = {0},
  pages = {35 - 47},
  year = {2013},
  note = {},
  issn = {0950-7051},
  doi = {http://dx.doi.org/10.1016/j.knosys.2013.06.020},
  url = {http://www.sciencedirect.com/science/article/pii/S0950705113002025},
  author = {Kazushi Ikeda and Gen Hattori and Chihiro Ono and Hideki Asoh and Teruo Higashino},
  keywords = {Web mining},
  keywords = {Market analysis},
  keywords = {User profiling},
  keywords = {Twitter},
  keywords = {Text analysis},
  keywords = {Community analysis},
  keywords = {Machine learning },
  abstract = {This paper proposes demographic estimation algorithms for profiling Twitter users, based on their tweets and community relationships. Many people post their opinions via social media services such as Twitter. This huge volume of opinions, expressed in real time, has great appeal as a novel marketing application. When automatically extracting these opinions, it is desirable to be able to discriminate opinions based on user demographics, because the ratio of positive and negative opinions differs depending on demographics such as age, gender, and residence area, all of which are essential for market analysis. In this paper, we propose a hybrid text-based and community-based method for the demographic estimation of Twitter users, where these demographics are estimated by tracking the tweet history and clustering of followers/followees. Our experimental results from 100,000 Twitter users show that the proposed hybrid method improves the accuracy of the text-based method. The proposed method is applicable to various user demographics and is suitable even for users who only tweet infrequently. }
}
@article{Oberreuter20133756,
  title = {Text mining applied to plagiarism detection: The use of words for detecting deviations in the writing style },
  journal = {Expert Systems with Applications },
  volume = {40},
  number = {9},
  pages = {3756 - 3763},
  year = {2013},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2012.12.082},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417412013231},
  author = {Gabriel Oberreuter and Juan D. Velásquez},
  keywords = {Text mining},
  keywords = {Text classification},
  keywords = {Plagiarism},
  keywords = {Copy detection},
  keywords = {Intrinsic plagiarism detection },
  abstract = {Plagiarism detection is of special interest to educational institutions, and with the proliferation of digital documents on the Web the use of computational systems for such a task has become important. While traditional methods for automatic detection of plagiarism compute the similarity measures on a document-to-document basis, this is not always possible since the potential source documents are not always available. We do text mining, exploring the use of words as a linguistic feature for analyzing a document by modeling the writing style present in it. The main goal is to discover deviations in the style, looking for segments of the document that could have been written by another person. This can be considered as a classification problem using self-based information where paragraphs with significant deviations in style are treated as outliers. This so-called intrinsic plagiarism detection approach does not need comparison against possible sources at all, and our model relies only on the use of words, so it is not language specific. We demonstrate that this feature shows promise in this area, achieving reasonable results compared to benchmark models. }
}
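
The entry above flags within-document deviations in writing style. A minimal sketch in plain Python: a word-frequency profile per paragraph is compared with the whole-document profile, and large deviations are flagged as outliers; the style function, threshold and texts are illustrative, not those of the paper.

from collections import Counter

def profile(text):
    # Relative word frequencies as a crude style fingerprint.
    words = text.lower().split()
    return {w: c / len(words) for w, c in Counter(words).items()}

def deviation(par, doc):
    vocab = set(par) | set(doc)
    return sum(abs(par.get(w, 0) - doc.get(w, 0)) for w in vocab)

paragraphs = [
    "the system mines the data and the model learns the patterns",
    "the data is mined and the patterns are learned by the model",
    "heretofore the undersigned party shall indemnify the aforementioned entity",
]
doc = profile(" ".join(paragraphs))
scores = [deviation(profile(p), doc) for p in paragraphs]
mean = sum(scores) / len(scores)
for p, s in zip(paragraphs, scores):
    flag = "<-- style outlier" if s > 1.3 * mean else ""
    print(round(s, 2), flag, p[:45])
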
@article{Liu2008700,
  title = {Knowledge maps for composite e-services: A mining-based system platform coupling with recommendations },
  journal = {Expert Systems with Applications },
  volume = {34},
  number = {1},
  pages = {700 - 716},
  year = {2008},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2006.10.005},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417406003216},
  author = {Duen-Ren Liu and Chih-Kun Ke and Jia-Yuan Lee and Chun-Feng Lee},
  keywords = {Composite e-service},
  keywords = {Knowledge maps},
  keywords = {Topic maps},
  keywords = {Data mining},
  keywords = {Recommendation },
  abstract = {Providing various e-services on the Internet by enterprises is an important trend in e-business. Composite e-services, which consist of various e-services provided by different e-service providers, are complex processes that require the cooperation among cross-organizational e-service providers. The flexibility and success of e-business depend on effective knowledge support to access related information resources of composite e-services. Thus, providing effective knowledge support for accessing composite e-services is a challenging task. This work proposes a knowledge map platform to provide an effective knowledge support for utilizing composite e-services. A data mining approach is applied to extract knowledge patterns from the usage records of composite e-services. Based on the mining result, topic maps are employed to construct the knowledge map. Meanwhile, the proposed knowledge map is integrated with recommendation capability to generate recommendations for composite e-services via data mining and collaborative filtering techniques. A prototype system is implemented to demonstrate the proposed platform. The proposed knowledge map enhanced with recommendation capability can provide users customized decision support to effectively utilize composite e-services. }
}
@article{KamsuFoguem20131034,
  title = {Mining association rules for the quality improvement of the production process },
  journal = {Expert Systems with Applications },
  volume = {40},
  number = {4},
  pages = {1034 - 1045},
  year = {2013},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2012.08.039},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417412010007},
  author = {Bernard Kamsu-Foguem and Fabien Rigal and Félix Mauget},
  keywords = {Data mining},
  keywords = {Association rule mining},
  keywords = {Knowledge discovery},
  keywords = {Continuous improvement},
  keywords = {Drilling product manufacturing},
  keywords = {Industrial maintenance },
  abstract = {Academics and practitioners have a common interest in the continuing development of methods and computer applications that support or perform knowledge-intensive engineering tasks. Operations management dysfunctions and lost production time are problems of enormous magnitude that impact the performance and quality of industrial systems as well as their cost of production. Association rule mining is a data mining technique used to find useful and valuable information in huge databases. This work develops a better conceptual base for improving the application of association rule mining methods to extract knowledge on operations and information management. The emphasis of the paper is on the improvement of operations processes. The application example details an industrial experiment in which association rule mining is used to analyze the manufacturing process of a fully integrated provider of drilling products. The study reports some new and interesting results obtained with data mining and knowledge discovery techniques applied to a drill production process. Experimental results on real-life data sets show that the proposed approach is useful in finding effective knowledge associated with the causes of dysfunctions. }
}
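
The entry above extracts association rules from production data. A minimal sketch, assuming Python with pandas and mlxtend installed; the maintenance events are hypothetical stand-ins for the drilling-process records of the study.

# Frequent itemsets with Apriori, then rules filtered by confidence.
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

events = [
    ["spindle_vibration", "tool_wear", "scrap_part"],
    ["spindle_vibration", "tool_wear"],
    ["coolant_low", "surface_defect"],
    ["spindle_vibration", "scrap_part"],
]

te = TransactionEncoder()
df = pd.DataFrame(te.fit(events).transform(events), columns=te.columns_)
frequent = apriori(df, min_support=0.5, use_colnames=True)
rules = association_rules(frequent, metric="confidence", min_threshold=0.6)
print(rules[["antecedents", "consequents", "support", "confidence"]])
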
@article{Rajpathak2013565,
  title = {An ontology based text mining system for knowledge discovery from the diagnosis data in the automotive domain },
  journal = {Computers in Industry },
  volume = {64},
  number = {5},
  pages = {565 - 580},
  year = {2013},
  note = {},
  issn = {0166-3615},
  doi = {http://dx.doi.org/10.1016/j.compind.2013.03.001},
  url = {http://www.sciencedirect.com/science/article/pii/S0166361513000456},
  author = {Dnyanesh G. Rajpathak},
  keywords = {Text mining},
  keywords = {Clustering},
  keywords = {Fault diagnosis},
  keywords = {Ontology},
  keywords = {Automotive },
  abstract = {In the automotive domain, an overwhelming volume of textual data is recorded in the form of repair verbatim collected during the fault diagnosis (FD) process. Here, the aim of the knowledge discovery using text mining (KDT) task is to discover the best-practice repair knowledge from millions of repair verbatim, enabling accurate FD. However, the complexity of the \{KDT\} problem is largely due to the fact that a significant amount of relevant knowledge is buried in noisy and unstructured verbatim. In this paper, we propose a novel ontology-based text mining system, which uses a diagnosis ontology for annotating key terms recorded in the repair verbatim. The annotated terms are extracted in different tuples, which are used to identify the field anomalies. The extracted tuples are further used by the frequently co-occurring clustering algorithm to cluster the repair verbatim data such that the best-practice repair actions used to fix commonly observed symptoms associated with the faulty parts can be discovered. The performance of our system has been validated using real-world data and it has been successfully implemented in a web-based distributed architecture in industry. }
}
@incollection{Coomans2009559,
  title = {2.26 - Unsupervised Data Mining: Introduction },
  editor = {Steven D. Brown and Romá Tauler and Beata Walczak},
  booktitle = {Comprehensive Chemometrics },
  publisher = {Elsevier},
  edition = {},
  address = {Oxford},
  year = {2009},
  pages = {559 - 576},
  isbn = {978-0-444-52701-1},
  doi = {http://dx.doi.org/10.1016/B978-044452701-1.00063-6},
  url = {http://www.sciencedirect.com/science/article/pii/B9780444527011000636},
  author = {D. Coomans and C. Smyth and I. Lee and T. Hancock and J. Yang},
  keywords = {cluster analysis},
  keywords = {cluster validity},
  keywords = {data mining},
  keywords = {proximities },
  abstract = {This chapter focuses on cluster analysis in the context of unsupervised data mining. Various facets of cluster analysis, including proximities, are discussed in detail. Techniques of determining the natural number of clusters are described. Finally, techniques of assessing cluster accuracy and reproducibility are detailed. Techniques mentioned in this chapter are expanded upon in the following chapters. }
}
@article{Baumgrass2013148,
  title = {Bridging the gap between role mining and role engineering via migration guides },
  journal = {Information Security Technical Report },
  volume = {17},
  number = {4},
  pages = {148 - 172},
  year = {2013},
  note = {Special Issue: \{ARES\} 2012 7th International Conference on Availability, Reliability and Security },
  issn = {1363-4127},
  doi = {http://dx.doi.org/10.1016/j.istr.2013.03.003},
  url = {http://www.sciencedirect.com/science/article/pii/S1363412713000198},
  author = {Anne Baumgrass and Mark Strembeck},
  keywords = {\{RBAC\}},
  keywords = {Migration},
  keywords = {Model comparison},
  keywords = {Role engineering},
  keywords = {Role mining },
  abstract = {In the context of role-based access control (RBAC), mining approaches, such as role mining or organizational mining, can be applied to derive permissions and roles from a system's configuration or from log files. In this way, mining techniques document the current state of a system and produce current-state \{RBAC\} models. However, such current-state \{RBAC\} models most often follow from structures that have evolved over time and are not the result of a systematic rights management procedure. In contrast, role engineering is applied to define a tailored \{RBAC\} model for a particular organization or information system. Thus, role engineering techniques produce a target-state \{RBAC\} model that is customized for the business processes supported via the respective information system. The migration from a current-state \{RBAC\} model to a tailored target-state \{RBAC\} model is, however, a complex task. In this paper, we present a systematic approach to migrate current-state \{RBAC\} models to target-state \{RBAC\} models. In particular, we use model comparison techniques to identify differences between two \{RBAC\} models. Based on these differences, we derive migration rules that define which elements and element relations must be changed, added, or removed. A migration guide then includes all migration rules that need to be applied to a particular current-state \{RBAC\} model to produce the corresponding target-state \{RBAC\} model. We conducted two comparative studies to identify which visualization technique is most suitable to make migration guides available to human users. Based on the results of these comparative studies, we implemented tool support for the derivation and visualization of migration guides. Our software tool is based on the Eclipse Modeling Framework (EMF). Moreover, this paper describes the experimental evaluation of our tool. }
}
@article{Basari2013453,
  title = {Opinion Mining of Movie Review using Hybrid Method of Support Vector Machine and Particle Swarm Optimization },
  journal = {Procedia Engineering },
  volume = {53},
  number = {0},
  pages = {453 - 462},
  year = {2013},
  note = {Malaysian Technical Universities Conference on Engineering & Technology 2012, \{MUCET\} 2012 },
  issn = {1877-7058},
  doi = {http://dx.doi.org/10.1016/j.proeng.2013.02.059},
  url = {http://www.sciencedirect.com/science/article/pii/S1877705813001781},
  author = {Abd. Samad Hasan Basari and Burairah Hussin and I. Gede Pramudya Ananta and Junta Zeniarja},
  keywords = {Opinion},
  keywords = {Opinion mining},
  keywords = {Sentiment},
  keywords = {Sentiment analysis},
  keywords = {\{SVM\}},
  keywords = {SVM-PSO },
  abstract = {Nowadays, online social media is an online discourse where people contribute to create content, share it, bookmark it, and network at an impressive rate. Twitter is currently among the fastest and easiest-to-use social media for messaging. The messages on Twitter include reviews and opinions on certain topics such as movies, books, products, politics, and so on. Based on this condition, this research attempts to use the messages of Twitter to review a movie by using opinion mining or sentiment analysis. Opinion mining refers to the application of natural language processing, computational linguistics, and text mining to identify or classify whether the movie is good or not based on message opinion. Support Vector Machine (SVM) is a supervised learning method that analyzes data and recognizes the patterns that are used for classification. This research concerns binary classification into two classes: positive and negative. The positive class indicates a good message opinion; the negative class indicates a bad message opinion of certain movies. This justification is based on the accuracy level of the \{SVM\}, with a validation process using 10-fold cross validation and a confusion matrix. A hybrid Particle Swarm Optimization (PSO) is used to improve the selection of the best parameters in order to solve the dual optimization problem. The result shows an improvement in accuracy from 71.87% to 77%. }
}
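% A minimal Python sketch of the SVM-PSO idea described above: a small
% particle swarm searches over (log10 C, log10 gamma) and each candidate is
% scored by 10-fold cross-validated accuracy. The synthetic dataset, swarm
% size and PSO constants are illustrative assumptions, not the paper's
% Twitter movie-review setup.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

rng = np.random.default_rng(0)
X, y = make_classification(n_samples=300, n_features=20, random_state=0)

def fitness(position):
    C, gamma = 10.0 ** position  # positions live in log10 space
    return cross_val_score(SVC(C=C, gamma=gamma), X, y, cv=10).mean()

n_particles, n_iters = 8, 15
pos = rng.uniform(-2, 2, size=(n_particles, 2))
vel = np.zeros_like(pos)
pbest, pbest_fit = pos.copy(), np.array([fitness(p) for p in pos])
gbest = pbest[pbest_fit.argmax()].copy()

for _ in range(n_iters):
    r1, r2 = rng.random(pos.shape), rng.random(pos.shape)
    vel = 0.7 * vel + 1.5 * r1 * (pbest - pos) + 1.5 * r2 * (gbest - pos)
    pos = np.clip(pos + vel, -3, 3)
    fit = np.array([fitness(p) for p in pos])
    improved = fit > pbest_fit
    pbest[improved], pbest_fit[improved] = pos[improved], fit[improved]
    gbest = pbest[pbest_fit.argmax()].copy()

print("best log10(C), log10(gamma):", gbest, "CV accuracy:", pbest_fit.max())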
@article{Hachana2013131,
  title = {Semantic analysis of role mining results and shadowed roles detection },
  journal = {Information Security Technical Report },
  volume = {17},
  number = {4},
  pages = {131 - 147},
  year = {2013},
  note = {Special Issue: \{ARES\} 2012 7th International Conference on Availability, Reliability and Security },
  issn = {1363-4127},
  doi = {http://dx.doi.org/10.1016/j.istr.2013.03.001},
  url = {http://www.sciencedirect.com/science/article/pii/S1363412713000174},
  author = {Safaà Hachana and Frédéric Cuppens and Nora Cuppens-Boulahia and Joaquin Garcia-Alfaro},
  keywords = {Access control},
  keywords = {Role mining},
  keywords = {\{IT\} security},
  keywords = {Boolean logic},
  keywords = {Role set comparison},
  keywords = {Shadowed roles },
  abstract = {The use of role engineering has grown in importance with the expansion of highly abstracted access control frameworks in organizations. In particular, the use of role mining techniques for the discovery of roles from previously deployed authorizations has facilitated the configuration of such frameworks. However, the literature lacks a clear basis for appraising and leveraging the learning outcomes of the role mining process. In this paper, we provide such a formal basis. We compare sets of roles by projecting roles from one set into the other set. This approach is useful to measure how comparable the two configurations of roles are, and to interpret each role. We formally define the problem of comparing sets of roles, and prove that the problem is NP-complete. Then, we propose an algorithm to map the inherent relationship between the sets based on Boolean expressions. We demonstrate the correctness and completeness of our solution, and investigate some further issues that may benefit from our approach, such as detection of unhandled perturbations or source misconfiguration. In particular, we emphasize that the presence of shadowed roles in the role configuration increases the time complexity of role set comparison. We provide a definition of the shadowed roles problem and propose a solution that detects different cases of role shadowing. }
}
@article{He201390,
  title = {Examining students’ online interaction in a live video streaming environment using data mining and text mining },
  journal = {Computers in Human Behavior },
  volume = {29},
  number = {1},
  pages = {90 - 102},
  year = {2013},
  note = {Including Special Section Youth, Internet, and Wellbeing },
  issn = {0747-5632},
  doi = {http://dx.doi.org/10.1016/j.chb.2012.07.020},
  url = {http://www.sciencedirect.com/science/article/pii/S0747563212002233},
  author = {Wu He},
  keywords = {Educational data mining},
  keywords = {Text mining},
  keywords = {Live video streaming},
  keywords = {Clustering analysis},
  keywords = {Online interaction},
  keywords = {Social interaction },
  abstract = {This study analyses the online questions and chat messages automatically recorded by a live video streaming (LVS) system using data mining and text mining techniques. We applied data mining and text mining techniques to analyze two different datasets and then conducted an in-depth correlation analysis for the two educational courses with the most online questions and chat messages, respectively. The study found discrepancies as well as similarities in the students’ patterns and themes of participation between online questions (student–instructor interaction) and online chat messages (student–student interaction or peer interaction). The results also identify disciplinary differences in students’ online participation. A correlation is found between the number of online questions students asked and students’ final grades. The data suggest that a combination of data mining and text mining techniques applied to a large amount of online learning data can yield considerable insights and reveal valuable patterns in students’ learning behaviors. Limitations of data and text mining are also revealed and discussed in the paper. }
}
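% A hedged sketch of the kind of text-mining step the study describes:
% clustering chat messages with TF-IDF features and k-means to surface
% participation themes. The messages are invented; the study analysed logs
% recorded by a live video streaming system.

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

messages = [
    "when is the assignment due", "is the deadline friday",
    "the audio is cutting out", "cannot hear the instructor",
    "great explanation of recursion", "thanks that example helped",
]
X = TfidfVectorizer(stop_words="english").fit_transform(messages)
labels = KMeans(n_clusters=3, n_init=10, random_state=0).fit_predict(X)
for message, label in zip(messages, labels):
    print(label, message)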
@article{Liu2013759,
  title = {Constrained frequent pattern mining on univariate uncertain data },
  journal = {Journal of Systems and Software },
  volume = {86},
  number = {3},
  pages = {759 - 778},
  year = {2013},
  note = {},
  issn = {0164-1212},
  doi = {http://dx.doi.org/10.1016/j.jss.2012.11.020},
  url = {http://www.sciencedirect.com/science/article/pii/S0164121212003135},
  author = {Ying-Ho Liu and Chun-Sheng Wang},
  keywords = {Frequent pattern mining},
  keywords = {Constrained mining},
  keywords = {Univariate uncertain data},
  keywords = {CUP-Miner },
  abstract = {In this paper, we propose a new algorithm called CUP-Miner (Constrained Univariate Uncertain Data Pattern Miner) for mining frequent patterns from univariate uncertain data under user-specified constraints. The discovered frequent patterns are called constrained frequent \{U2\} patterns (where “U2” represents “univariate uncertain”). In univariate uncertain data, each attribute in a transaction is associated with a quantitative interval and a probability density function. The CUP-Miner algorithm is implemented in two phases: In the first phase, a U2P-tree (Univariate Uncertain Pattern tree) is constructed by compressing the target database transactions into a compact tree structure. Then, in the second phase, the constrained frequent \{U2\} pattern is enumerated by traversing the U2P-tree with different strategies that correspond to different types of constraints. The algorithm speeds up the mining process by exploiting five constraint properties: succinctness, anti-monotonicity, monotonicity, convertible anti-monotonicity, and convertible monotonicity. Our experimental results demonstrate that CUP-Miner outperforms the modified \{CAP\} algorithm, the modified \{FIC\} algorithm, the modified U2P-Miner algorithm, and the modified Apriori algorithm. }
}
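% A hedged sketch of constrained frequent pattern mining on uncertain data,
% using a simpler existence-probability model than the paper's quantitative
% intervals with probability density functions: an itemset's expected support
% is the sum over transactions of the product of its items' probabilities,
% and an anti-monotone constraint (here: total price within a budget) prunes
% candidates. This brute-force enumeration only illustrates the definitions;
% CUP-Miner itself compresses the database into a U2P-tree.

import math
from itertools import combinations

transactions = [  # item -> existence probability (toy data)
    {"a": 0.9, "b": 0.8, "c": 0.4},
    {"a": 0.7, "c": 0.9},
    {"b": 0.6, "c": 0.5},
]
price = {"a": 3, "b": 5, "c": 2}
MIN_ESUP, MAX_PRICE = 0.5, 8

def expected_support(itemset):
    total = 0.0
    for t in transactions:
        if all(i in t for i in itemset):
            total += math.prod(t[i] for i in itemset)
    return total

for size in range(1, len(price) + 1):
    for itemset in combinations(sorted(price), size):
        if sum(price[i] for i in itemset) > MAX_PRICE:
            continue  # anti-monotone: every superset violates the budget too
        esup = expected_support(itemset)
        if esup >= MIN_ESUP:
            print(itemset, round(esup, 3))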
@incollection{Fong2013385,
  title = {18 - Opportunities and Challenges of Integrating Bio-Inspired Optimization and Data Mining Algorithms },
  editor = {Xin-She Yang and Zhihua Cui and Renbin Xiao and Amir Hossein Gandomi and Mehmet Karamanoglu},
  booktitle = {Swarm Intelligence and Bio-Inspired Computation },
  publisher = {Elsevier},
  edition = {},
  address = {Oxford},
  year = {2013},
  pages = {385 - 402},
  isbn = {978-0-12-405163-8},
  doi = {http://dx.doi.org/10.1016/B978-0-12-405163-8.00018-1},
  url = {http://www.sciencedirect.com/science/article/pii/B9780124051638000181},
  author = {Simon Fong},
  keywords = {Bio-inspired},
  keywords = {Optimization},
  keywords = {Data Mining},
  keywords = {Classification},
  keywords = {Clustering},
  keywords = {Feature Selection },
  abstract = {Data mining has evolved from methods of simple statistical analysis to complex pattern recognition over the past decades. During this progression, data mining algorithms have been modified or extended to overcome specific problems. This chapter discusses the prospects of improving data mining algorithms by integrating bio-inspired optimization, which has lately captivated much of researchers’ attention. In particular, high dimensionality and the unavailability of the whole data set in the training data (as in stream mining) are known to be two major challenges. Through two small examples, K-means clustering and time-series classification, we demonstrate that these two challenges can be overcome by integrating data mining and bio-inspired algorithms. }
}
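% A hedged sketch of the kind of integration the chapter discusses: a simple
% population-based search over candidate k-means seedings, keeping the one
% with the lowest within-cluster SSE. A real bio-inspired variant would
% evolve the population (e.g. with PSO moves) instead of merely sampling it.

import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=300, centers=3, random_state=1)
rng = np.random.default_rng(1)

best_inertia, best_model = np.inf, None
for _ in range(10):  # population of candidate seedings
    seeds = X[rng.choice(len(X), size=3, replace=False)]
    model = KMeans(n_clusters=3, init=seeds, n_init=1).fit(X)
    if model.inertia_ < best_inertia:
        best_inertia, best_model = model.inertia_, model

print("best within-cluster SSE:", round(best_inertia, 2))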
@article{Nenonen2013215,
  title = {Analysing factors related to slipping, stumbling, and falling accidents at work: Application of data mining methods to Finnish occupational accidents and diseases statistics database },
  journal = {Applied Ergonomics },
  volume = {44},
  number = {2},
  pages = {215 - 224},
  year = {2013},
  note = {},
  issn = {0003-6870},
  doi = {http://dx.doi.org/10.1016/j.apergo.2012.07.001},
  url = {http://www.sciencedirect.com/science/article/pii/S0003687012001081},
  author = {Noora Nenonen},
  keywords = {Occupational accident},
  keywords = {Data mining},
  keywords = {Slip},
  keywords = {Trip},
  keywords = {Fall },
  abstract = {The utilisation of data mining methods has become common in many fields. In occupational accident analysis, however, these methods are still rarely exploited. This study applies methods of data mining (decision tree and association rules) to the Finnish national occupational accidents and diseases statistics database to analyse factors related to slipping, stumbling, and falling (SSF) accidents at work from 2006 to 2007. \{SSF\} accidents at work constitute a large proportion (22%) of all accidents at work in Finland. In addition, they are more likely to result in longer periods of incapacity for work than other workplace accidents. The most important factor influencing whether or not an accident at work is related to \{SSF\} is the specific physical activity of movement. In addition, the risk of \{SSF\} accidents at work seems to depend on the occupation and the age of the worker. The results were in line with previous research; hence, the application of data mining methods was considered successful, although the results did not reveal anything unexpected. Nevertheless, because of their capability to illustrate a large dataset and relationships between variables easily, data mining methods were seen as useful supplementary methods for analysing occupational accident data. }
}
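% A hedged sketch of the decision-tree half of the analysis: predicting
% whether an accident record is a slip/stumble/fall (SSF) case from
% categorical factors such as physical activity and age group. The records
% and factor levels are invented; the study used the Finnish national
% occupational accidents and diseases statistics database.

import pandas as pd
from sklearn.tree import DecisionTreeClassifier, export_text

records = pd.DataFrame({
    "activity":  ["movement", "handling", "movement", "tool_use", "movement"],
    "age_group": ["50+", "25-49", "50+", "25-49", "<25"],
    "is_ssf":    [1, 0, 1, 0, 0],
})
X = pd.get_dummies(records[["activity", "age_group"]])
tree = DecisionTreeClassifier(max_depth=3, random_state=0)
tree.fit(X, records["is_ssf"])
print(export_text(tree, feature_names=list(X.columns)))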
@article{He2013500,
  title = {Improving user experience with case-based reasoning systems using text mining and Web 2.0 },
  journal = {Expert Systems with Applications },
  volume = {40},
  number = {2},
  pages = {500 - 507},
  year = {2013},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2012.07.070},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417412009268},
  author = {Wu He},
  keywords = {Case-based reasoning},
  keywords = {Text mining},
  keywords = {Web 2.0},
  keywords = {Blog},
  keywords = {Tags},
  keywords = {Case base},
  keywords = {Case library},
  keywords = {User experience},
  keywords = {User interface},
  keywords = {Sustainability },
  abstract = {Many \{CBR\} systems have been developed in the past. However, many \{CBR\} systems currently face sustainability issues such as outdated cases and stagnant case growth. Some \{CBR\} systems have fallen into disuse due to the lack of new cases, case updates, user participation and user engagement. To encourage the use of \{CBR\} systems and give users a better experience, \{CBR\} system developers need to come up with new ways to add features and value to \{CBR\} systems. The author proposes a framework that uses text mining and Web 2.0 technologies to improve and enhance \{CBR\} systems for providing a better user experience. Two case studies were conducted to evaluate the usefulness of text mining techniques and Web 2.0 technologies for enhancing a large-scale \{CBR\} system. The results suggest that text mining and Web 2.0 are promising ways to bring additional value to \{CBR\} and that they should be incorporated into the \{CBR\} design and development process for the benefit of \{CBR\} users. }
}
@article{Chanda201546,
  title = {An efficient approach to mine flexible periodic patterns in time series databases },
  journal = {Engineering Applications of Artificial Intelligence },
  volume = {44},
  number = {0},
  pages = {46 - 63},
  year = {2015},
  note = {},
  issn = {0952-1976},
  doi = {http://dx.doi.org/10.1016/j.engappai.2015.04.014},
  url = {http://www.sciencedirect.com/science/article/pii/S0952197615001013},
  author = {Ashis Kumar Chanda and Swapnil Saha and Manziba Akanda Nishi and Md. Samiullah and Chowdhury Farhan Ahmed},
  keywords = {Data mining},
  keywords = {Time series databases},
  keywords = {Periodic pattern},
  keywords = {Suffix tree},
  keywords = {Flexible patterns},
  keywords = {Knowledge discovery },
  abstract = {Periodic pattern mining in time series databases is one of the most interesting data mining problems and appears frequently in many real-life applications. Some existing approaches find fixed-length periodic patterns by using a suffix tree structure, i.e., they are unable to mine flexible patterns. One existing approach generates periodic patterns by skipping intermediate events, i.e., flexible patterns, using an apriori-based sequential pattern mining approach. Since apriori-based approaches suffer from huge candidate generation and a large percentage of false pattern pruning, we propose an efficient algorithm, \{FPPM\} (Flexible Periodic Pattern Mining), using a suffix trie data structure. The proposed algorithm can capture more effective variable-length flexible periodic patterns by neglecting unimportant or undesired events and considering only the important events in an efficient way. To the best of our knowledge, ours is the first approach that simultaneously handles various starting positions throughout the sequences, flexibility among events in the mined patterns, and interactive tuning of period values on the go. Complexity analysis of the proposed approach and comparison with existing approaches, along with analytical comparison on various issues, have been performed. Extensive experimental analyses are also conducted to evaluate the performance of the proposed \{FPPM\} algorithm using real-life datasets. The proposed approach outperforms existing algorithms in terms of processing time, scalability, and quality of mined patterns. }
}
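% A hedged sketch of the underlying notion of periodicity: for a candidate
% period p, count how often each symbol recurs at the same phase and treat
% the other positions as "don't care" (*). FPPM itself mines variable-length
% flexible patterns over a suffix trie; this toy scan only illustrates what
% a periodic pattern with skipped events looks like.

from collections import Counter

sequence = "abcabdabcabe"  # toy event sequence
MIN_CONF = 0.75

for period in range(2, 5):
    for phase in range(period):
        symbols = sequence[phase::period]      # events at this phase
        symbol, count = Counter(symbols).most_common(1)[0]
        confidence = count / len(symbols)      # fraction of matching cycles
        if confidence >= MIN_CONF:
            pattern = ["*"] * period
            pattern[phase] = symbol
            print(f"period={period} pattern={''.join(pattern)} conf={confidence:.2f}")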
@article{Santos201364,
  title = {Opcode sequences as representation of executables for data-mining-based unknown malware detection },
  journal = {Information Sciences },
  volume = {231},
  number = {0},
  pages = {64 - 82},
  year = {2013},
  note = {Data Mining for Information Security },
  issn = {0020-0255},
  doi = {http://dx.doi.org/10.1016/j.ins.2011.08.020},
  url = {http://www.sciencedirect.com/science/article/pii/S0020025511004336},
  author = {Igor Santos and Felix Brezo and Xabier Ugarte-Pedrero and Pablo G. Bringas},
  keywords = {Malware detection},
  keywords = {Computer security},
  keywords = {Data mining},
  keywords = {Machine learning},
  keywords = {Supervised learning },
  abstract = {Malware can be defined as any type of malicious code that has the potential to harm a computer or network. The volume of malware is growing faster every year and poses a serious global security threat. Consequently, malware detection has become a critical topic in computer security. Currently, signature-based detection is the most widespread method used in commercial antivirus. In spite of the broad use of this method, it can detect malware only after the malicious executable has already caused damage and provided the malware is adequately documented. Therefore, the signature-based method consistently fails to detect new malware. In this paper, we propose a new method to detect unknown malware families. This model is based on the frequency of the appearance of opcode sequences. Furthermore, we describe a technique to mine the relevance of each opcode and assess the frequency of each opcode sequence. In addition, we provide empirical validation that this new method is capable of detecting unknown malware. }
}
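% A hedged sketch of the representation described above: executables as
% opcode-sequence (n-gram) frequency vectors fed to a supervised classifier.
% The opcode traces and labels are toy inventions, and the random forest is
% our stand-in classifier; the paper additionally weights sequences by mined
% opcode relevance.

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer

traces = [
    "push mov call pop ret", "push mov call call ret",  # benign-looking
    "xor jmp xor jmp int", "xor xor jmp int int",       # malicious-looking
]
labels = [0, 0, 1, 1]  # 0 = benign, 1 = malware

vectorizer = CountVectorizer(ngram_range=(2, 2), token_pattern=r"\S+")
X = vectorizer.fit_transform(traces)  # opcode 2-gram frequency matrix
classifier = RandomForestClassifier(random_state=0).fit(X, labels)
print(classifier.predict(vectorizer.transform(["push mov call ret ret"])))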
@article{DeWeerdt201357,
  title = {Process Mining for the multi-faceted analysis of business processes—A case study in a financial services organization },
  journal = {Computers in Industry },
  volume = {64},
  number = {1},
  pages = {57 - 67},
  year = {2013},
  note = {},
  issn = {0166-3615},
  doi = {http://dx.doi.org/10.1016/j.compind.2012.09.010},
  url = {http://www.sciencedirect.com/science/article/pii/S0166361512001479},
  author = {Jochen De Weerdt and Annelies Schupp and An Vanderloock and Bart Baesens},
  keywords = {Process Mining},
  keywords = {Event log analysis},
  keywords = {Real-life application},
  keywords = {Financial services industry },
  abstract = {Most organizations have some kind of process-oriented information system that keeps track of business events. Process Mining starts from event logs extracted from these systems in order to discover, analyze, diagnose and improve processes, organizational, social and data structures. Notwithstanding the large number of contributions to the process mining literature over the last decade, the number of studies actually demonstrating the applicability and value of these techniques in practice has been limited. As a consequence, there is a need for real-life case studies suggesting methodologies to conduct process mining analysis and to show the benefits of its application in real-life environments. In this paper we present a methodological framework for a multi-faceted analysis of real-life event logs based on Process Mining. As such, we demonstrate the usefulness and flexibility of process mining techniques to expose organizational inefficiencies in a real-life case study that is centered on the back office process of a large Belgian insurance company. Our analysis shows that process mining techniques constitute an ideal means to tackle organizational challenges by suggesting process improvements and creating a company-wide process awareness. }
}
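% A hedged sketch of a basic process-discovery step behind such case studies:
% deriving the directly-follows relation from an event log grouped by case.
% The log is invented; the paper analyses a Belgian insurer's back-office
% process with full-fledged process mining tooling.

from collections import Counter

event_log = [  # (case_id, activity), ordered by timestamp within each case
    (1, "receive"), (1, "check"), (1, "approve"),
    (2, "receive"), (2, "check"), (2, "reject"),
    (3, "receive"), (3, "approve"),
]

traces = {}
for case, activity in event_log:
    traces.setdefault(case, []).append(activity)

dfg = Counter()  # directly-follows graph: (a, b) -> frequency
for activities in traces.values():
    for a, b in zip(activities, activities[1:]):
        dfg[(a, b)] += 1

for (a, b), n in sorted(dfg.items()):
    print(f"{a} -> {b}: {n}")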
@article{Valsamidis2013264,
  title = {A Framework for Opinion Mining in Blogs for Agriculture },
  journal = {Procedia Technology },
  volume = {8},
  number = {0},
  pages = {264 - 274},
  year = {2013},
  note = {6th International Conference on Information and Communication Technologies in Agriculture, Food and Environment (HAICTA 2013) },
  issn = {2212-0173},
  doi = {http://dx.doi.org/10.1016/j.protcy.2013.11.036},
  url = {http://www.sciencedirect.com/science/article/pii/S2212017313000984},
  author = {Stavros Valsamidis and Theodosios Theodosiou and Ioannis Kazanidis and Michael Nikolaidis},
  keywords = {blogs},
  keywords = {agriculture},
  keywords = {opinion mining},
  keywords = {positive and negative attitudes },
  abstract = {In recent years there has been much talk about blogging and the way in which blogs influence media and change the way people communicate and share knowledge. Blogs are also at the center of commercial attention, and a large number of academic researchers study them. Furthermore, blogs represent an important new arena for knowledge discovery in the agriculture sector, since farmers use them for professional reasons. Opinion mining is a kind of text mining. Its goal is to assess the attitude of the author with respect to a given subject. The attitude may be a positive or negative opinion. This paper outlines the challenges and opportunities of blogs for agriculture in terms of analyzing the information stored in them. Opinion mining is applied to an experimental blog with the aid of the RapidMiner software. This framework may thus help establish baselines for opinion mining tasks in agriculture. }
}
@incollection{Kamath201317,
  title = {Chapter 2 - Data Mining in Materials Science and Engineering },
  editor = {Rajan, Krishna },
  booktitle = {Informatics for Materials Science and Engineering },
  publisher = {Butterworth-Heinemann},
  edition = {},
  address = {Oxford},
  year = {2013},
  pages = {17 - 36},
  isbn = {978-0-12-394399-6},
  doi = {http://dx.doi.org/10.1016/B978-0-12-394399-6.00002-3},
  url = {http://www.sciencedirect.com/science/article/pii/B9780123943996000023},
  author = {Chandrika Kamath and Ya Ju Fan},
  keywords = {Data mining},
  keywords = {Image analysis},
  keywords = {Dimension reduction},
  keywords = {Feature selection},
  keywords = {Feature transformation },
  abstract = {Data mining is the process of uncovering patterns, associations, anomalies, and statistically significant structures and events in data. It borrows and builds on ideas from many disciplines, ranging from statistics to machine learning, mathematical optimization, and signal and image processing. Data mining techniques are becoming an integral part of scientific endeavors in many application domains, including astronomy, bioinformatics, chemistry, materials science, climate, fusion, and combustion. In this chapter, we provide a brief introduction to the data mining process and some of the algorithms used in extracting information from scientific data sets. }
}
@article{Caron20131357,
  title = {Comprehensive rule-based compliance checking and risk management with process mining },
  journal = {Decision Support Systems },
  volume = {54},
  number = {3},
  pages = {1357 - 1369},
  year = {2013},
  note = {},
  issn = {0167-9236},
  doi = {http://dx.doi.org/10.1016/j.dss.2012.12.012},
  url = {http://www.sciencedirect.com/science/article/pii/S0167923612003788},
  author = {Filip Caron and Jan Vanthienen and Bart Baesens},
  keywords = {Business rules},
  keywords = {Compliance checking},
  keywords = {Risk management},
  keywords = {Process mining},
  keywords = {Process-aware information systems },
  abstract = {Process mining researchers have primarily focused on developing and improving process discovery techniques, while attention to the applicability of process mining has been below par. As a result, there exists only a partial fit with the traditional requirements for compliance checking and risk management. This paper proposes a comprehensive rule-based process mining approach for a timely investigation of a complete set of enriched process event data. Additionally, the contribution elaborates a two-dimensional business rule taxonomy that serves as a source of business rules for the comprehensive rule-based compliance checking approach. Finally, the study provides a formal grounding for and an evaluation of the comprehensive rule-based compliance checking approach. }
}
@article{Verma20131366,
  title = {Predicting the total suspended solids in wastewater: A data-mining approach },
  journal = {Engineering Applications of Artificial Intelligence },
  volume = {26},
  number = {4},
  pages = {1366 - 1372},
  year = {2013},
  note = {},
  issn = {0952-1976},
  doi = {http://dx.doi.org/10.1016/j.engappai.2012.08.015},
  url = {http://www.sciencedirect.com/science/article/pii/S0952197612002163},
  author = {Anoop Verma and Xiupeng Wei and Andrew Kusiak},
  keywords = {Total suspended solids},
  keywords = {Data-mining},
  keywords = {Influent flow rate},
  keywords = {Prediction},
  keywords = {Wastewater treatment },
  abstract = {Total suspended solids (TSS) are a major pollutant that affects waterways all over the world. Predicting the values of \{TSS\} is of interest to quality control of wastewater processing. Due to infrequent measurements, time series data for \{TSS\} are constructed using influent flow rate and influent carbonaceous bio-chemical oxygen demand (CBOD). We investigated different scenarios of daily average influent \{CBOD\} and influent flow rate measured at 15 min intervals. Then, we used five data-mining algorithms, i.e., multi-layered perceptron, k-nearest neighbor, multi-variate adaptive regression spline, support vector machine, and random forest, to construct day-ahead, time-series prediction models for TSS. Historical \{TSS\} values were used as input parameters to predict current and future values of TSS. A sliding-window approach was used to improve the results of the predictions. }
}
@article{Wittek2013198,
  title = {Accelerating text mining workloads in a MapReduce-based distributed \{GPU\} environment },
  journal = {Journal of Parallel and Distributed Computing },
  volume = {73},
  number = {2},
  pages = {198 - 206},
  year = {2013},
  note = {},
  issn = {0743-7315},
  doi = {http://dx.doi.org/10.1016/j.jpdc.2012.10.001},
  url = {http://www.sciencedirect.com/science/article/pii/S0743731512002353},
  author = {Peter Wittek and Sándor Darányi},
  keywords = {\{GPU\} computing},
  keywords = {MapReduce},
  keywords = {Text mining},
  keywords = {Self-organizing maps},
  keywords = {Random projection },
  abstract = {Scientific computations have been using GPU-enabled computers successfully, often relying on distributed nodes to overcome the limitations of device memory. Only a handful of text mining applications benefit from such infrastructure. Since the initial steps of text mining are typically data intensive, and the ease of deployment of algorithms is an important factor in developing advanced applications, we introduce a flexible, distributed, MapReduce-based text mining workflow that performs I/O-bound operations on \{CPUs\} with industry-standard tools and then runs compute-bound operations on \{GPUs\} which are optimized to ensure coalesced memory access and effective use of shared memory. We have performed extensive tests of our algorithms on a cluster of eight nodes with two \{NVidia\} Tesla \{M2050s\} attached to each, and we achieve considerable speedups for random projection and self-organizing maps. }
}
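% A hedged sketch of one compute-bound kernel mentioned above: random
% projection of a document-term matrix to a lower dimension. Plain NumPy on
% the CPU here; the paper distributes this kind of workload over GPUs with
% MapReduce.

import numpy as np

rng = np.random.default_rng(0)
docs = rng.random((1000, 5000))  # toy document-term matrix
k = 128                          # target dimensionality
R = rng.normal(0.0, 1.0 / np.sqrt(k), size=(5000, k))  # Gaussian projection
projected = docs.dot(R)          # (1000, 128) reduced representation
print(projected.shape)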
@article{Hung2013775,
  title = {Web usage mining for analysing elder self-care behavior patterns },
  journal = {Expert Systems with Applications },
  volume = {40},
  number = {2},
  pages = {775 - 783},
  year = {2013},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2012.08.037},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417412009980},
  author = {Yu-Shiang Hung and Kuei-Ling B. Chen and Chi-Ta Yang and Guang-Feng Deng},
  keywords = {Elder self-care behavior pattern},
  keywords = {Web usage mining},
  keywords = {Cluster analysis},
  keywords = {Sequential profiles},
  keywords = {Markov model},
  keywords = {Association analysis },
  abstract = {The rapid growth of the elderly population has increased the need to support elders in maintaining independent and healthy lifestyles in their homes rather than through more expensive and isolated care facilities. Self-care can improve the competence of elderly participants in managing their own health conditions without leaving home. The main purpose of this study is to understand the self-care behavior of elderly participants in a developed self-care service system and to analyze the daily self-care activities and health status of elders who live at home alone. To understand elder self-care patterns, log data from actual cases of elder self-care service were collected and analysed by Web usage mining. This study analysed 3391 sessions of 157 elders for the month of March, 2012. First, self-care use cycle, time, function numbers, and the depth and extent (range) of services were statistically analysed. Association rules were then used for data mining to find relationships between these functions of self-care behavior. Second, data from interest-based representation schemes were used to construct elder sessions. The ART2-enhanced K-mean algorithm was then used to mine cluster patterns. Finally, sequential profiles for elder self-care behavior patterns were captured by applying sequence-based representation schemes in association with Markov models and ART2-enhanced K-mean clustering algorithms to mine cluster patterns of sequence behavior for the elders. The analysis results can be used for research in medicine, public health, nursing and psychology and for policy-making in the health care domain. }
}
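% A hedged sketch of the sequence-modelling step: estimating first-order
% Markov transition probabilities over self-care functions from user
% sessions. The sessions and function names are invented toy data.

from collections import Counter, defaultdict

sessions = [
    ["login", "blood_pressure", "diary", "logout"],
    ["login", "blood_pressure", "logout"],
    ["login", "diary", "blood_pressure", "logout"],
]

counts = defaultdict(Counter)
for session in sessions:
    for a, b in zip(session, session[1:]):
        counts[a][b] += 1  # observed transition a -> b

for a, nxt in counts.items():
    total = sum(nxt.values())
    for b, n in nxt.items():
        print(f"P({b} | {a}) = {n / total:.2f}")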
@article{Won20132482,
  title = {A Design Methodology for Distributed Adaptive Stream Mining Systems },
  journal = {Procedia Computer Science },
  volume = {18},
  number = {0},
  pages = {2482 - 2491},
  year = {2013},
  note = {2013 International Conference on Computational Science },
  issn = {1877-0509},
  doi = {http://dx.doi.org/10.1016/j.procs.2013.05.425},
  url = {http://www.sciencedirect.com/science/article/pii/S1877050913005681},
  author = {Stephen Won and Inkeun Cho and Kishan Sudusinghe and Jie Xu and Yu Zhang and Mihaela van der Schaar and Shuvra S. Bhattacharyya},
  keywords = {Adaptive stream mining},
  keywords = {Dataflow graphs},
  keywords = {Distributed signal processing },
  abstract = {Data-driven, adaptive computations are key to enabling the deployment of accurate and efficient stream mining systems, which invoke suitably configured queries in real-time on streams of input data. Due to the physical separation among data sources and computational resources, it is often necessary to deploy such stream mining systems in a distributed fashion, where local learners have access to disjoint subsets of the data that is to be mined, and forward their intermediate results to an ensemble learner that combines the results from the local learners. In this paper, we develop a design methodology for integrated design, simulation, and implementation of dynamic data-driven adaptive stream mining systems. By systematically integrating considerations associated with local embedded processing, classifier configuration, data-driven adaptation and networked communication, our approach allows for effective assessment, prototyping, and implementation of alternative distributed design methods for data-driven, adaptive stream mining systems. We demonstrate our results on a dynamic data-driven application involving patient health care monitoring. }
}
@article{Li201350,
  title = {A tree-network model for mining short message services seed users and its empirical analysis },
  journal = {Knowledge-Based Systems },
  volume = {40},
  number = {0},
  pages = {50 - 57},
  year = {2013},
  note = {},
  issn = {0950-7051},
  doi = {http://dx.doi.org/10.1016/j.knosys.2012.11.010},
  url = {http://www.sciencedirect.com/science/article/pii/S095070511200319X},
  author = {Yongli Li and Chong Wu and Xudong Wang and Shitang Wu},
  keywords = {Network analysis},
  keywords = {Seed users},
  keywords = {Evaluation},
  keywords = {Data mining},
  keywords = {Information management },
  abstract = {Identifying short message service (SMS) seed users helps to discover the information’s origins and transmission paths. A tree-network model is proposed to depict the characteristics of \{SMS\} seed users, who exhibit three features: “ahead of time”, “mass texting” and “numerous retransmissions”. To acquire the width and depth of the established network model, a density-based clustering algorithm was adopted and a recursion algorithm was designed. An objective, comprehensive and scale-free evaluation function was further presented to rank the potential seed users by using the width and the depth obtained above. Furthermore, an empirical analysis of the model was made based on part of Shenzhen’s cell phone \{SMS\} data from February 2012. The model is effective and applicable as a powerful tool to solve the \{SMS\} seed users’ mining problem. }
}
@article{Cruz20133174,
  title = {‘Long autonomy or long delay?’ The importance of domain in opinion mining },
  journal = {Expert Systems with Applications },
  volume = {40},
  number = {8},
  pages = {3174 - 3184},
  year = {2013},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2012.12.031},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417412012729},
  author = {Fermín L. Cruz and José A. Troyano and Fernando Enríquez and F. Javier Ortega and Carlos G. Vallejo},
  keywords = {Sentiment analysis},
  keywords = {Opinion mining},
  keywords = {Feature-based opinion extraction},
  keywords = {User-generated contents},
  keywords = {Information extraction },
  abstract = {Nowadays, people do not only navigate the web, but they also contribute contents to the Internet. Among other things, they write their thoughts and opinions in review sites, forums, social networks, blogs and other websites. These opinions constitute a valuable resource for businesses, governments and consumers. In the last years, some researchers have proposed opinion extraction systems, mostly domain-independent ones, to automatically extract structured representations of opinions contained in those texts. In this work, we tackle this task in a domain-oriented approach, defining a set of domain-specific resources which capture valuable knowledge about how people express opinions on a given domain. These resources are automatically induced from a set of annotated documents. Some experiments were carried out on three different domains (user-generated reviews of headphones, hotels and cars), comparing our approach to other state-of-the-art, domain-independent techniques. The results confirm the importance of the domain in order to build accurate opinion extraction systems. Some experiments on the influence of the dataset size and an example of aggregation and visualization of the extracted opinions are also shown. }
}
@article{Chiou2013405,
  title = {A two-stage mining framework to explore key risk conditions on one-vehicle crash severity },
  journal = {Accident Analysis & Prevention },
  volume = {50},
  number = {0},
  pages = {405 - 415},
  year = {2013},
  note = {},
  issn = {0001-4575},
  doi = {http://dx.doi.org/10.1016/j.aap.2012.05.017},
  url = {http://www.sciencedirect.com/science/article/pii/S0001457512002011},
  author = {Yu-Chiun Chiou and Lawrence W. Lan and Wen-Pin Chen},
  keywords = {Crash severity},
  keywords = {Genetic mining rule},
  keywords = {One-vehicle crashes},
  keywords = {Mixed logit model},
  keywords = {Stepwise rule-mining algorithm },
  abstract = {This paper proposes a two-stage mining framework to explore the key risk conditions that may have contributed to one-vehicle crash severity on Taiwan's freeways. In the first stage, a genetic mining rule (GMR) model is developed, using a novel stepwise rule-mining algorithm, to identify the potential risk conditions that best elucidate one-vehicle crash severity. In the second stage, a mixed logit model is estimated, using the antecedent part of the mined rules as explanatory variables, to test the significance of the risk conditions. A total of 5563 one-vehicle crash cases (226 fatalities, 1593 injuries and 3744 property losses) that occurred on Taiwan's freeways over 2003–2007 are analyzed. The \{GMR\} model mined 29 rules for use. By incorporating these 29 mined rules into a mixed logit model, we further identify one key safe condition and four key risk conditions leading to serious crashes (i.e., fatalities and injuries). Each key risk condition is discussed and compared with its adjacent rules. Based on the findings, some countermeasures to rectify the freeway's serious one-vehicle crashes are proposed. }
}
@article{Madani2013330,
  title = {Semi-structured Documents Mining: A Review and Comparison },
  journal = {Procedia Computer Science },
  volume = {22},
  number = {0},
  pages = {330 - 339},
  year = {2013},
  note = {17th International Conference in Knowledge Based and Intelligent Information and Engineering Systems - \{KES2013\} },
  issn = {1877-0509},
  doi = {http://dx.doi.org/10.1016/j.procs.2013.09.110},
  url = {http://www.sciencedirect.com/science/article/pii/S1877050913009034},
  author = {Amina Madani and Omar Boussaid and Djamel Eddine Zegour},
  keywords = {Semi-structured documents},
  keywords = {documents mining},
  keywords = {clustering},
  keywords = {association},
  keywords = {classification},
  keywords = {structure mining},
  keywords = {content mining },
  abstract = {The number of semi-structured documents being produced is steadily increasing, so discovering new knowledge from them is becoming essential. In this survey paper, we review popular semi-structured document mining approaches (structure alone, and both structure and content). We provide a brief description of each technique as well as efficient algorithms for implementing it, and compare the approaches using different comparison criteria. }
}
@article{Ahn201212551,
  title = {Effective product assignment based on association rule mining in retail },
  journal = {Expert Systems with Applications },
  volume = {39},
  number = {16},
  pages = {12551 - 12556},
  year = {2012},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2012.04.086},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417412006987},
  author = {Kwang-Il Ahn},
  keywords = {Data mining},
  keywords = {Association rules},
  keywords = {Cross-selling},
  keywords = {Product assignment },
  abstract = {Much academic research has been conducted on the process of association rule mining. More effort is now required for practical application of association rules in various commercial fields. A potential application of association rule mining is the problem of product assignment in retail. The product assignment problem involves how to most effectively assign items to sites in retail stores to grow sales. Effective product assignment facilitates cross-selling and convenient shopping for customers to promote maximum sales for retailers. However, little practical research has been done to address the issue. The current study approaches the product assignment problem using association rule mining for retail environments. There are some barriers to overcome in applying association rule mining to the product assignment problem for retail. This study conducts some generalization to overcome drawbacks caused by the short lifecycles of current products. As a measure of cross-selling, lift is used to compare the effectiveness of various assignments for products. The proposed algorithm consists of three processes, which include mining associations among items, nearest neighbor assignments, and updating assignments. The algorithm was tested on synthetic databases. The results show very effective product assignment in terms of the potential for cross-selling to drive maximum sales for retailers. }
}
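% A minimal sketch of the cross-selling measure the paper builds on: the lift
% of an association rule A -> B over a transaction database, defined as
% lift = P(A and B) / (P(A) * P(B)). The transactions are invented.

transactions = [
    {"bread", "butter"}, {"bread", "butter", "milk"},
    {"milk"}, {"bread"}, {"butter", "milk"},
]

def support(itemset):
    return sum(itemset <= t for t in transactions) / len(transactions)

def lift(a, b):
    return support(a | b) / (support(a) * support(b))

# lift > 1 indicates a positive association worth exploiting for assignment
print(f"lift(bread -> butter) = {lift({'bread'}, {'butter'}):.2f}")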
@article{MarreseTaylor2013182,
  title = {Identifying Customer Preferences about Tourism Products Using an Aspect-based Opinion Mining Approach },
  journal = {Procedia Computer Science },
  volume = {22},
  number = {0},
  pages = {182 - 191},
  year = {2013},
  note = {17th International Conference in Knowledge Based and Intelligent Information and Engineering Systems - \{KES2013\} },
  issn = {1877-0509},
  doi = {http://dx.doi.org/10.1016/j.procs.2013.09.094},
  url = {http://www.sciencedirect.com/science/article/pii/S1877050913008879},
  author = {Edison Marrese-Taylor and Juan D. Velásquez and Felipe Bravo-Marquez and Yutaka Matsuo},
  keywords = {opinion mining},
  keywords = {aspect-based},
  keywords = {tourism},
  keywords = {customer preferences},
  keywords = {natural language processing},
  keywords = {web mining },
  abstract = {In this study we extend Bing Liu's aspect-based opinion mining technique to apply it to the tourism domain. Using this extension, we also offer a new alternative for discovering consumer preferences about tourism products, particularly hotels and restaurants, using opinions available on the Web as reviews. An experiment is also conducted, using hotel and restaurant reviews obtained from TripAdvisor, to evaluate our proposals. Results showed that tourism product reviews available on web sites contain valuable information about customer preferences that can be extracted using an aspect-based opinion mining approach. The proposed approach proved to be very effective in determining the sentiment orientation of opinions, achieving a precision and recall of 90%. However, on average, the algorithms were only capable of extracting 35% of the explicit aspect expressions. }
}
@article{Pinzón201315,
  title = {idMAS-SQL: Intrusion Detection Based on \{MAS\} to Detect and Block \{SQL\} injection through data mining },
  journal = {Information Sciences },
  volume = {231},
  number = {0},
  pages = {15 - 31},
  year = {2013},
  note = {Data Mining for Information Security },
  issn = {0020-0255},
  doi = {http://dx.doi.org/10.1016/j.ins.2011.06.020},
  url = {http://www.sciencedirect.com/science/article/pii/S0020025511003148},
  author = {Cristian I. Pinzón and Juan F. De Paz and Álvaro Herrero and Emilio Corchado and Javier Bajo and Juan M. Corchado},
  keywords = {Intrusion Detection},
  keywords = {\{SQL\} injection attacks},
  keywords = {Data mining},
  keywords = {\{CBR\}},
  keywords = {\{SVM\}},
  keywords = {Neural networks },
  abstract = {This study presents a multiagent architecture aimed at detecting \{SQL\} injection attacks, which are among the most dangerous and prevalent threats for modern online databases. The proposed architecture is based on a hierarchical and distributed strategy where the functionalities are structured in layers. The agents in each of the layers are specialized in specific tasks, such as data gathering, data classification, and visualization. This study presents two key agents under a hybrid architecture: a classifier agent that incorporates a Case-Based Reasoning engine employing advanced algorithms in the reasoning cycle stages, and a visualizer agent that integrates several techniques to facilitate the visual analysis of suspicious queries. The former incorporates a new classification model based on a mixture of a neural network and a Support Vector Machine in order to classify \{SQL\} queries in a reliable way. The latter combines clustering and neural projection techniques to support the visual analysis and identification of target attacks. The proposed approach was tested in a real-traffic case study and its experimental results, which validate the performance of the proposed approach, are presented in this paper. }
}
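% A hedged sketch of the classification step only: labelling SQL query
% strings with character n-gram features and a linear SVM. The toy queries
% are invented, and the single SVM is a simplification; the paper mixes a
% neural network with an SVM inside a multiagent CBR architecture.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

queries = [
    "SELECT name FROM users WHERE id = 4",
    "SELECT price FROM items WHERE sku = 'a1'",
    "SELECT name FROM users WHERE id = 4 OR 1=1 --",
    "SELECT name FROM users WHERE id = 4; DROP TABLE users",
]
labels = [0, 0, 1, 1]  # 0 = legitimate, 1 = injection attempt

vectorizer = TfidfVectorizer(analyzer="char", ngram_range=(2, 4))
classifier = LinearSVC().fit(vectorizer.fit_transform(queries), labels)
print(classifier.predict(vectorizer.transform(
    ["SELECT name FROM users WHERE id = 7 OR 1=1"])))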
@article{Tu2012303,
  title = {Indices of novelty for emerging topic detection },
  journal = {Information Processing & Management },
  volume = {48},
  number = {2},
  pages = {303 - 325},
  year = {2012},
  note = {},
  issn = {0306-4573},
  doi = {http://dx.doi.org/10.1016/j.ipm.2011.07.006},
  url = {http://www.sciencedirect.com/science/article/pii/S0306457311000768},
  author = {Yi-Ning Tu and Jia-Lang Seng},
  keywords = {Topic detection and tracking},
  keywords = {Text mining},
  keywords = {Information retrieval},
  keywords = {Novelty index},
  keywords = {Published volume index},
  keywords = {Aging theory },
  abstract = {Emerging topic detection is a vital research area for researchers and scholars interested in searching for and tracking new research trends and topics. The current methods of text mining and data mining used for this purpose focus only on the frequency with which subjects are mentioned, and ignore the novelty of the subject, which is also critical but beyond the scope of a frequency study. This work tackles this inadequacy by proposing a new set of indices for emerging topic detection: the novelty index (NI) and the published volume index (PVI). This new set of indices is created based on time, volume, and frequency, and provides a more precise set of prediction indices. They are then utilized to determine the detection point (DP) of new emerging topics. Following the detection point, the intersection decides the worth of a new topic. The algorithms presented in this paper can be used to decide the novelty and life span of an emerging topic in a specific field. The entire comprehensive collection of the \{ACM\} Digital Library is examined in the experiments. The application of the \{NI\} and \{PVI\} gives a promising indication of emerging topics in conferences and journals. }
}
@article{Lee201213338,
  title = {Unsupervised and supervised learning to evaluate event relatedness based on content mining from social-media streams },
  journal = {Expert Systems with Applications },
  volume = {39},
  number = {18},
  pages = {13338 - 13356},
  year = {2012},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2012.05.068},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417412007841},
  author = {Chung-Hong Lee},
  keywords = {Stream mining},
  keywords = {Data mining},
  keywords = {Event evaluation},
  keywords = {Social networks },
  abstract = {Due to the explosive growth of social-media applications, enhancing event-awareness by social mining has become extremely important. The contents of microblogs preserve valuable information associated with past disastrous events and stories. To learn the experiences from past events for tackling emerging real-world events, in this work we utilize the social-media messages to characterize real-world events through mining their contents and extracting essential features for relatedness analysis. On one hand, we established an online clustering approach on Twitter microblogs for detecting emerging events, and meanwhile we performed event relatedness evaluation using an unsupervised clustering approach. On the other hand, we developed a supervised learning model to create extensible measure metrics for offline evaluation of event relatedness. By means of supervised learning, our developed measure metrics are able to compute relatedness of various historical events, allowing the event impacts on specified domains to be quantitatively measured for event comparison. By combining the strengths of both methods, the experimental results showed that the combined framework in our system is sensible for discovering more unknown knowledge about event impacts and enhancing event awareness. }
}
@article{Quartulli201311,
  title = {A review of \{EO\} image information mining },
  journal = {\{ISPRS\} Journal of Photogrammetry and Remote Sensing },
  volume = {75},
  number = {0},
  pages = {11 - 28},
  year = {2013},
  note = {},
  issn = {0924-2716},
  doi = {http://dx.doi.org/10.1016/j.isprsjprs.2012.09.010},
  url = {http://www.sciencedirect.com/science/article/pii/S0924271612001797},
  author = {Marco Quartulli and Igor G. Olaizola},
  keywords = {Remote sensing},
  keywords = {Databases},
  keywords = {Content-based image retrieval},
  keywords = {\{EO\} mining},
  keywords = {Information retrieval },
  abstract = {We analyze the state of the art of content-based retrieval in Earth observation image archives focusing on complete systems showing promise for operational implementation. The different paradigms at the basis of the main system families are introduced. The approaches taken are considered, focusing in particular on the phases after primitive feature extraction. The solutions envisaged for the issues related to feature simplification and synthesis, indexing, semantic labeling are reviewed. The methodologies for query specification and execution are evaluated. Conclusions are drawn on the state of published research in Earth observation (EO) mining. }
}
@article{JunquédeFortuny201211616,
  title = {Media coverage in times of political crisis: A text mining approach },
  journal = {Expert Systems with Applications },
  volume = {39},
  number = {14},
  pages = {11616 - 11622},
  year = {2012},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2012.04.013},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417412006100},
  author = {Enric Junqué de Fortuny and Tom De Smedt and David Martens and Walter Daelemans},
  keywords = {Politics},
  keywords = {Sentiment mining},
  keywords = {Opinion mining},
  keywords = {Data mining},
  keywords = {Coverage },
  abstract = {At the end of 2011 Belgium formed a government, after a world-record-breaking period of 541 days of negotiations. We have gathered and analysed 68,000 related on-line news articles published in 2011 in Flemish newspapers. These articles were analysed by a custom-built expert system. The results of our text mining analyses show interesting differences in media coverage and votes for several political parties and politicians. With opinion mining, we are able to automatically detect the sentiment of each article, thereby allowing us to visualise how the tone of reporting evolved throughout the year, on a party, politician and newspaper level. Our suggested framework introduces a generic text mining approach to analyse media coverage on political issues, including a set of methodological guidelines, evaluation metrics, as well as open source opinion mining tools. Since all analyses are based on automated text mining algorithms, an objective overview of the manner of reporting is provided. The analysis shows peaks of positive and negative sentiments during key moments in the negotiation process. }
}
@article{Sha2013812,
  title = {EPLogCleaner: Improving Data Quality of Enterprise Proxy Logs for Efficient Web Usage Mining },
  journal = {Procedia Computer Science },
  volume = {17},
  number = {0},
  pages = {812 - 818},
  year = {2013},
  note = {First International Conference on Information Technology and Quantitative Management },
  issn = {1877-0509},
  doi = {http://dx.doi.org/10.1016/j.procs.2013.05.104},
  url = {http://www.sciencedirect.com/science/article/pii/S1877050913002378},
  author = {Hongzhou Sha and Tingwen Liu and Peng Qin and Yong Sun and Qingyun Liu},
  keywords = {Web usage mining},
  keywords = {Data cleaning},
  keywords = {Enterprise proxy logs },
  abstract = {Data cleaning is an important step performed in the preprocessing stage of web usage mining, and is widely used in many data mining systems. Despite many efforts on data cleaning for web server logs, it is still an open question for enterprise proxy logs. With unlimited access to websites, enterprise proxy logs trace web requests from multiple clients to multiple web servers, which makes them quite different from web server logs in both location and content. Therefore, many irrelevant items such as software updating requests cannot be filtered out by traditional data cleaning methods. In this paper, we propose the first method, named \{EPLogCleaner\}, that can filter out plenty of irrelevant items based on the common prefix of their URLs. We evaluate \{EPLogCleaner\} with a real network traffic trace captured from one enterprise proxy. Experimental results show that \{EPLogCleaner\} can improve the data quality of enterprise proxy logs by filtering out more than 30% more \{URL\} requests than traditional data cleaning methods. }
}
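% A hedged sketch of the core idea above: treating URL requests that share a
% high-frequency common prefix (e.g. automated update traffic) as noise and
% filtering them out before web usage mining. The coarse host-plus-first-
% segment prefix and the threshold are simplifying assumptions, not the
% paper's exact algorithm.

from collections import Counter
from urllib.parse import urlsplit

requests = [
    "http://update.example.com/v2/check?id=1",
    "http://update.example.com/v2/check?id=2",
    "http://update.example.com/v2/check?id=3",
    "http://news.example.org/article/42",
]

def prefix(url):  # host plus first path segment as a coarse URL prefix
    parts = urlsplit(url)
    return parts.netloc + "/" + parts.path.strip("/").split("/")[0]

counts = Counter(prefix(u) for u in requests)
THRESHOLD = 3  # prefixes at least this frequent are treated as automated noise
cleaned = [u for u in requests if counts[prefix(u)] < THRESHOLD]
print(cleaned)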
@incollection{Stanković2013223,
  title = {Chapter 4 - A Classification of Data Mining Algorithms for Wireless Sensor Networks, and Classification Extension to Concept Modeling in System of Wireless Sensor Networks Based on Natural Language Processing },
  editor = {Ali Hurson},
  booktitle = {Connected Computing Environment},
  publisher = {Elsevier},
  year = {2013},
  volume = {90},
  pages = {223 - 283},
  series = {Advances in Computers },
  issn = {0065-2458},
  doi = {http://dx.doi.org/10.1016/B978-0-12-408091-1.00004-X},
  url = {http://www.sciencedirect.com/science/article/pii/B978012408091100004X},
  author = {Staša Vujičić Stanković and Nemanja Kojić and Goran Rakočević and Duško Vitas and Veljko Milutinović},
  keywords = {Data mining},
  keywords = {Wireless sensor networks},
  keywords = {Concept modeling},
  keywords = {Natural language processing},
  keywords = {Information extraction},
  keywords = {Named entity recognition },
  abstract = {In this article, we propose one original classification and one extension thereof, which takes into consideration the relevant issues in Natural Language Processing. The newly introduced classification of Data Mining algorithms is on the level of a single Wireless Sensor Network and its extension to Concept Modeling on the level of a System of Wireless Sensor Networks. Most of the scientists in this field put emphasis on issues related to applications of Wireless Sensor Networks in different areas, while we here put emphasis on categorization of the selected approaches from the open literature, to help application designers/developers get a better understanding of their options in different areas. Our main goal is to provide a good starting point for a more effective analysis leading to possible new solutions, possible improvements of existing solutions, and possible combination of two or more of the existing solutions into new ones, using the hybridization principle. Another contribution of this article is a synergistic interdisciplinary review of problems in two areas: Data Mining and Natural Language Processing. This enables interoperability improvements on the interface between Wireless Sensor Networks that often share data in native natural languages. }
}
@article{Li201323,
  title = {A fuzzy conceptualization model for text mining with application in opinion polarity classification },
  journal = {Knowledge-Based Systems },
  volume = {39},
  number = {0},
  pages = {23 - 33},
  year = {2013},
  note = {},
  issn = {0950-7051},
  doi = {http://dx.doi.org/10.1016/j.knosys.2012.10.005},
  url = {http://www.sciencedirect.com/science/article/pii/S0950705112002742},
  author = {Sheng-Tun Li and Fu-Ching Tsai},
  keywords = {Text classification},
  keywords = {Fuzzy formal concept analysis},
  keywords = {Text mining},
  keywords = {Sentiment analysis},
  keywords = {Feature extraction },
  abstract = {Automatic text classification in text mining is a critical technique to manage huge collections of documents. However, most existing document classification algorithms are easily affected by ambiguous terms. The ability to disambiguate for a classifier is thus as important as the ability to classify accurately. In this paper, we propose a novel classification framework based on fuzzy formal concept analysis to conceptualize documents into a more abstract form of concepts, and use these as the training examples to alleviate the arbitrary outcomes caused by ambiguous terms. The proposed model is evaluated on a benchmark testbed and two opinion polarity datasets. The experimental results indicate superior performance in all datasets. Applying concept analysis to opinion polarity classification is a leading endeavor in the disambiguation of Web 2.0 contents, and the approach presented in this paper offers significant improvements on current methods. The results of the proposed model reveal its ability to decrease the sensitivity to noise, as well as its adaptability in cross domain applications. }
}
@article{Kardan201393,
  title = {A novel approach to hybrid recommendation systems based on association rules mining for content recommendation in asynchronous discussion groups },
  journal = {Information Sciences },
  volume = {219},
  number = {0},
  pages = {93 - 110},
  year = {2013},
  note = {Abbreviations: WSD, Word Sense Disambiguation; CF, collaborative filtering; CBF, content-based filtering; CSCL, computer supported collaborative learning; HF, Hybrid Filtering; MAE, Mean Absolute Error },
  issn = {0020-0255},
  doi = {http://dx.doi.org/10.1016/j.ins.2012.07.011},
  url = {http://www.sciencedirect.com/science/article/pii/S0020025512004756},
  author = {Ahmad A. Kardan and Mahnaz Ebrahimi},
  keywords = {Recommender system},
  keywords = {Asynchronous discussion group},
  keywords = {Collaborative filtering},
  keywords = {Content-based filtering},
  keywords = {Hybrid recommender system},
  keywords = {Association rules mining },
  abstract = {Recommender systems have been developed in a variety of domains, and asynchronous discussion groups are among the most interesting ones. Due to information overload and its variety in discussion groups, it is difficult to draw out the relevant information. Therefore, recommender systems play an important role in filtering and customizing the desired information. Nowadays, collaborative and content-based filtering are the most widely adopted techniques in recommender systems. The collaborative filtering technique recommends items based on like-minded users’ opinions and users’ preferences. Alternatively, the aim of the content-based filtering technique is the identification of items similar to those a user has preferred in the past. To overcome the drawbacks of the aforementioned techniques, a hybrid recommender system combines two or more recommendation techniques to obtain more accuracy. The most important achievement of this study is to present a novel approach to hybrid recommendation systems, which identifies the user similarity neighborhood from implicit information collected in a discussion group. In the proposed system, the association rules mining technique is first applied to discover similar users, and then the related posts are recommended to them. To select the appropriate contents in the transacted posts, it is necessary to focus on concepts rather than keywords. Therefore, to locate semantically related concepts, a Word Sense Disambiguation strategy based on the WordNet lexical database is exploited. The experiments carried out on the discussion group datasets showed a noticeable improvement in the accuracy of useful posts recommended to the users, in comparison to the content-based and collaborative filtering techniques. }
}
@article{Wang2015122,
  title = {Analyzing internet topics by visualizing microblog retweeting },
  journal = {Journal of Visual Languages & Computing },
  volume = {28},
  number = {0},
  pages = {122 - 133},
  year = {2015},
  note = {},
  issn = {1045-926X},
  doi = {http://dx.doi.org/10.1016/j.jvlc.2014.11.007},
  url = {http://www.sciencedirect.com/science/article/pii/S1045926X14001335},
  author = {Changbo Wang and Yuhua Liu and Zhao Xiao and Aoying Zhou and Kang Zhang},
  keywords = {Microblog retweeting},
  keywords = {Internet topics},
  keywords = {Visualization},
  keywords = {Analyzing },
  abstract = {Abstract Microblog is a large-scale information sharing platform where retweeting plays an important role in information diffusion. Analyzing retweeting evolution can help in reasoning about trends in public opinion. Information visualization techniques are used to demonstrate retweeting behavior in order to understand how Internet topics diffuse on microblogs. First, a graph clustering method is used to analyze the retweeting relationships among people of different occupations. Then a new algorithm based on an electric field is proposed to lay out the relationship links. A prediction method based on three diffusion models is presented to predict the number of retweets over time. Finally, three real-world case studies show the validity of our methods. }
}
@article{Colak2012241,
  title = {Data mining and wind power prediction: A literature review },
  journal = {Renewable Energy },
  volume = {46},
  number = {0},
  pages = {241 - 247},
  year = {2012},
  note = {},
  issn = {0960-1481},
  doi = {http://dx.doi.org/10.1016/j.renene.2012.02.015},
  url = {http://www.sciencedirect.com/science/article/pii/S0960148112001541},
  author = {Ilhami Colak and Seref Sagiroglu and Mehmet Yesilbudak},
  keywords = {Data mining},
  keywords = {Data mining techniques},
  keywords = {Wind power prediction},
  keywords = {Prediction time scales and models},
  keywords = {Literature evaluation },
  abstract = {Wind power generated by wind turbines has a non-schedulable nature due to the stochastic nature of meteorological conditions. Hence, wind power predictions are required, from a few seconds to one week ahead, in turbine control, load tracking, pre-load sharing, power system management and energy trading. In order to overcome problems in the predictions, many different wind power prediction models have been proposed in the literature. Data mining and its applications have received more attention in recent years. This paper presents a review study based on very short-term, short-term, medium-term and long-term wind power predictions. The studies available in the literature have been evaluated and criticized in consideration of their prediction accuracies and deficiencies. It is shown that adaptive neuro-fuzzy inference systems, neural networks and multilayer perceptrons give better results in wind power predictions. }
}
@article{GhediniRalha201211642,
  title = {A multi-agent data mining system for cartel detection in Brazilian government procurement },
  journal = {Expert Systems with Applications },
  volume = {39},
  number = {14},
  pages = {11642 - 11656},
  year = {2012},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2012.04.037},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417412006343},
  author = {Célia Ghedini Ralha and Carlos Vinícius Sarmento Silva},
  keywords = {Multi-agent data mining system},
  keywords = {Cartel detection},
  keywords = {Brazilian government procurement},
  keywords = {\{AGMI\}},
  keywords = {Multi-agent},
  keywords = {Distributed data mining},
  keywords = {Database knowledge discovery },
  abstract = {The main focus of this research project is the problem of extracting useful information from the Brazilian federal procurement process databases used by government auditors in the process of corruption detection and prevention to identify cartel formation among applicants. Extracting useful information to enhance cartel detection is a complex problem from many perspectives due to the large volume of data used to correlate information and the dynamic and diversified strategies companies use to hide their fraudulent operations. To attack the problem of data volume, we have used two data mining model functions, clustering and association rules, and a multi-agent approach to address the dynamic strategies of companies that are involved in cartel formation. To integrate both solutions, we have developed AGMI, an agent-mining tool that was validated using real data from the Brazilian Office of the Comptroller General, an institution of government auditing, where several measures are currently used to prevent and fight corruption. Our approach resulted in explicit knowledge discovery because \{AGMI\} presented many association rules that provided a 90% correct identification of cartel formation, according to expert assessment. According to auditing specialists, the extracted knowledge could help in the detection, prevention and monitoring of cartels that act in public procurement processes. }
}
@article{Thorleuchter201213026,
  title = {Predicting e-commerce company success by mining the text of its publicly-accessible website },
  journal = {Expert Systems with Applications },
  volume = {39},
  number = {17},
  pages = {13026 - 13034},
  year = {2012},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2012.05.096},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417412008123},
  author = {Dirk Thorleuchter and Dirk Van den Poel},
  keywords = {Success factor},
  keywords = {E-commerce},
  keywords = {\{LSI\}},
  keywords = {Classification},
  keywords = {Text mining},
  keywords = {Website },
  abstract = {We analyze the impact of textual information from e-commerce companies’ websites on their commercial success. The textual information is extracted from the web content of e-commerce companies, divided into the Top 100 worldwide most successful companies and the Top 101 to 500 worldwide most successful companies. It is shown that latent semantic concepts extracted from the analysis of textual information can be adopted as success factors for a Top 100 e-commerce company classification. This contributes to the existing literature concerning e-commerce success factors. As evaluation, a regression model based on these concepts is built that is successful in predicting the commercial success of the Top 100 companies. These findings are valuable for the creation of e-commerce websites. }
}
@article{Khare201229,
  title = {Decision support for improved service effectiveness using domain aware text mining },
  journal = {Knowledge-Based Systems },
  volume = {33},
  number = {0},
  pages = {29 - 40},
  year = {2012},
  note = {},
  issn = {0950-7051},
  doi = {http://dx.doi.org/10.1016/j.knosys.2012.03.005},
  url = {http://www.sciencedirect.com/science/article/pii/S0950705112000731},
  author = {Vineet R. Khare and Rahul Chougule},
  keywords = {Decision support systems},
  keywords = {Association mining},
  keywords = {Text mining},
  keywords = {Anomaly detection},
  keywords = {Semantic text analysis },
  abstract = {This paper presents a decision support system ‘Domain Aware Text & Association Mining (DATAM)’ which has been developed to improve after-sales service and repairs for the automotive domain. A novel approach that compares textual and non-textual data for anomaly detection is proposed. It combines association and ontology based text mining. Association mining has been employed to identify the repairs performed in the field for a given symptom, whereas, text mining is used to infer repairs from the textual instructions mentioned in service documents for the same symptom. These in turn are compared and contrasted to identify the anomalous cases. The developed approach has been applied to automotive field data. Using the top 20 most frequent symptoms, observed in a mid-sized sedan built and sold in North America, it is demonstrated that \{DATAM\} can identify all the anomalous symptom – repair code combinations (with a false positive rate of 0.04). This knowledge, in the form of anomalies, can subsequently be used to improve the service/trouble-shooting procedure and identify technician training needs. }
}
@article{Batista2012209,
  title = {Multi-element determination in Brazilian honey samples by inductively coupled plasma mass spectrometry and estimation of geographic origin with data mining techniques },
  journal = {Food Research International },
  volume = {49},
  number = {1},
  pages = {209 - 215},
  year = {2012},
  note = {},
  issn = {0963-9969},
  doi = {http://dx.doi.org/10.1016/j.foodres.2012.07.015},
  url = {http://www.sciencedirect.com/science/article/pii/S096399691200258X},
  author = {B.L. Batista and L.R.S. da Silva and B.A. Rocha and J.L. Rodrigues and A.A. Berretta-Silva and T.O. Bonates and V.S.D. Gomes and R.M. Barbosa and F. Barbosa},
  keywords = {Data mining},
  keywords = {Classification},
  keywords = {Pattern recognition},
  keywords = {Trace elements},
  keywords = {Honey},
  keywords = {ICP-MS },
  abstract = {Multi-element analysis of honey samples was carried out with the aim of developing a reliable method of tracing the origin of honey. Forty-two chemical elements were determined (Al, Cu, Pb, Zn, Mn, Cd, Tl, Co, Ni, Rb, Ba, Be, Bi, U, V, Fe, Pt, Pd, Te, Hf, Mo, Sn, Sb, P, La, Mg, I, Sm, Tb, Dy, Sd, Th, Pr, Nd, Tm, Yb, Lu, Gd, Ho, Er, Ce, Cr) by inductively coupled plasma mass spectrometry (ICP-MS). Then, three machine learning tools for classification and two for attribute selection were applied in order to prove that it is possible to use data mining tools to find the region where honey originated. Our results clearly demonstrate the potential of Support Vector Machine (SVM), Multilayer Perceptron (MLP) and Random Forest (RF) chemometric tools for honey origin identification. Moreover, the selection tools allowed a reduction from 42 trace element concentrations to only 5. }
}
@article{Huang201235,
  title = {On mining clinical pathway patterns from medical behaviors },
  journal = {Artificial Intelligence in Medicine },
  volume = {56},
  number = {1},
  pages = {35 - 50},
  year = {2012},
  note = {},
  issn = {0933-3657},
  doi = {http://dx.doi.org/10.1016/j.artmed.2012.06.002},
  url = {http://www.sciencedirect.com/science/article/pii/S0933365712000656},
  author = {Zhengxing Huang and Xudong Lu and Huilong Duan},
  keywords = {Clinical pathway analysis},
  keywords = {Pattern mining},
  keywords = {Process mining},
  keywords = {Clinical workflow log },
  abstract = {Objective Clinical pathway analysis, as a pivotal issue in ensuring specialized, standardized, normalized and sophisticated therapy procedures, is receiving increasing attention in the field of medical informatics. Clinical pathway pattern mining is one of the most important components of clinical pathway analysis and aims to discover which medical behaviors are essential/critical for clinical pathways, and also where temporal orders of these medical behaviors are quantified with numerical bounds. Even though existing clinical pathway pattern mining techniques can tell us which medical behaviors are frequently performed and in which order, they seldom precisely provide quantified temporal order information of critical medical behaviors in clinical pathways. Methods This study adopts process mining to analyze clinical pathways. The key contribution of the paper is to develop a new process mining approach to find a set of clinical pathway patterns given a specific clinical workflow log and minimum support threshold. The proposed approach not only discovers which critical medical behaviors are performed and in which order, but also provides comprehensive knowledge about quantified temporal orders of medical behaviors in clinical pathways. Results The proposed approach is evaluated via real-world data-sets, which are extracted from Zhejiang Huzhou Central hospital of China with regard to six specific diseases, i.e., bronchial lung cancer, gastric cancer, cerebral hemorrhage, breast cancer, infarction, and colon cancer, in two years (2007.08–2009.09). As compared to the general sequence pattern mining algorithm, the proposed approach consumes less processing time, generates quite a smaller number of clinical pathway patterns, and has a linear scalability in terms of execution time against the increasing size of data sets. Conclusion The experimental results indicate the applicability of the proposed approach, based on which it is possible to discover clinical pathway patterns that can cover most frequent medical behaviors that are most regularly encountered in clinical practice. Therefore, it holds significant promise in research efforts related to the analysis of clinical pathways. }
}
@article{Silva2012108,
  title = {Finding occupational accident patterns in the extractive industry using a systematic data mining approach },
  journal = {Reliability Engineering & System Safety },
  volume = {108},
  number = {0},
  pages = {108 - 122},
  year = {2012},
  note = {},
  issn = {0951-8320},
  doi = {http://dx.doi.org/10.1016/j.ress.2012.07.001},
  url = {http://www.sciencedirect.com/science/article/pii/S0951832012001354},
  author = {Joaquim F. Silva and Celeste Jacinto},
  keywords = {Extractive Industry},
  keywords = {Mining and quarrying},
  keywords = {Occupational accident},
  keywords = {Accident pattern},
  keywords = {Eurostat data},
  keywords = {Data mining },
  abstract = {This paper deals with occupational accident patterns in the Portuguese Extractive Industry. It constitutes a significant advance in relation to a previous study made in 2008, both in terms of methodology and of extended knowledge of the patterns’ details. This work uses more recent data (2005–2007), and this time the identification of the “typical accident” shifts from a bivariate to a multivariate pattern, characterising the accident mechanisms more accurately. Instead of crossing only two variables (Deviation x Contact), the new methodology developed here uses data mining techniques to associate nine variables, through their categories, and to quantify the statistical cohesion of each pattern. The results confirmed the “typical accident” of the 2008 study, but went much further: they reveal three statistically significant patterns (the top-3 categories in frequency); moreover, each pattern now includes more variables (4–5 categories) and indicates their statistical cohesion. This approach allowed a more accurate vision of the reality, which is fundamental for risk management. The methodology is best suited for large groups, such as national Authorities, Insurers or Corporate Groups, to assist them in planning target-oriented safety strategies. Not least importantly, researchers can apply the same algorithm to other study areas, as it is restricted neither to accidents nor to safety. }
}
@article{Chen20129588,
  title = {Comparison of feature-level learning methods for mining online consumer reviews },
  journal = {Expert Systems with Applications },
  volume = {39},
  number = {10},
  pages = {9588 - 9601},
  year = {2012},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2012.02.158},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417412004216},
  author = {Li Chen and Luole Qi and Feng Wang},
  keywords = {Consumer reviews},
  keywords = {E-commerce},
  keywords = {Feature-level opinion mining},
  keywords = {Conditional Random Fields (CRFs)},
  keywords = {Lexicalized Hidden Markov Model (L-HMMs)},
  keywords = {Association rule mining },
  abstract = {The tasks of feature-level opinion mining usually include the extraction of product entities from consumer reviews, the identification of opinion words that are associated with the entities, and the determination of these opinions’ polarities (e.g., positive, negative, or neutral). In recent years, two major approaches have been proposed to determine opinions at the feature level: model-based methods such as the one based on the lexicalized Hidden Markov Model (L-HMMs), and statistical methods like the association rule mining based technique. However, little work has compared these algorithms regarding their practical abilities in identifying various types of review elements, such as features, opinions, intensifiers, entity phrases and infrequent entities. On the other hand, little attention has been paid to applying more discriminative learning models to accomplish these opinion mining tasks. In this paper, we not only experimentally compared these methods on a real-world review dataset, but also in particular adopted the Conditional Random Fields (CRFs) model and evaluated its performance in comparison with related algorithms. Moreover, for the CRFs-based mining algorithm, we tested the role of a self-tagging process in two automatic training conditions, and further identified the ideal combination of learning functions to optimize its learning performance. The comparative experiment eventually revealed the superior accuracy of the CRFs-based method in mining multiple review elements, relative to other methods. }
}
@article{Liao201211303,
  title = {Data mining techniques and applications – A decade review from 2000 to 2011 },
  journal = {Expert Systems with Applications },
  volume = {39},
  number = {12},
  pages = {11303 - 11311},
  year = {2012},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2012.02.063},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417412003077},
  author = {Shu-Hsien Liao and Pei-Hui Chu and Pei-Yuan Hsiao},
  keywords = {Data mining},
  keywords = {Data mining techniques},
  keywords = {Data mining applications},
  keywords = {Literature survey },
  abstract = {In order to determine how data mining techniques (DMT) and their applications have developed during the past decade, this paper reviews data mining techniques and their applications and development through a survey of literature and the classification of articles from 2000 to 2011. Keyword indices and article abstracts were used to identify 216 articles concerning \{DMT\} applications in 159 academic journals (retrieved from five online databases). This paper surveys and classifies DMT, with respect to the following three areas: knowledge types, analysis types, and architecture types, together with their applications in different research and practical domains. A discussion deals with the direction of future developments in \{DMT\} methodologies and applications: (1) \{DMT\} is finding increasing applications in expertise orientation, and the development of applications for \{DMT\} is a problem-oriented domain. (2) It is suggested that different social science methodologies, such as psychology, cognitive science and human behavior, might implement DMT as an alternative to the methodologies already on offer. (3) The ability to continually change and acquire new understanding is a driving force for the application of \{DMT\} and will allow many new future applications. }
}
@article{Liu2012320,
  title = {Mining event logs to support workflow resource allocation },
  journal = {Knowledge-Based Systems },
  volume = {35},
  number = {0},
  pages = {320 - 331},
  year = {2012},
  note = {},
  issn = {0950-7051},
  doi = {http://dx.doi.org/10.1016/j.knosys.2012.05.010},
  url = {http://www.sciencedirect.com/science/article/pii/S0950705112001542},
  author = {Tingyu Liu and Yalong Cheng and Zhonghua Ni},
  keywords = {Workflow},
  keywords = {Resource allocation},
  keywords = {Data mining},
  keywords = {Process mining},
  keywords = {Association rules },
  abstract = {Currently, workflow technology is widely used to facilitate the business process in enterprise information systems (EIS), and it has the potential to reduce design time, enhance product quality and decrease product cost. However, significant limitations still exist: as an important task in the context of workflow, many present resource allocation (also known as “staff assignment”) operations are still performed manually, which are time-consuming. This paper presents a data mining approach to address the resource allocation problem (RAP) and improve the productivity of workflow resource management. Specifically, an Apriori-like algorithm is used to find the frequent patterns from the event log, and association rules are generated according to predefined resource allocation constraints. Subsequently, a correlation measure named lift is utilized to annotate the negatively correlated resource allocation rules for resource reservation. Finally, the rules are ranked using the confidence measures as resource allocation rules. Comparative experiments are performed using C4.5, SVM, ID3, Naïve Bayes and the presented approach, and the results show that the presented approach is effective in both accuracy and candidate resource recommendations. }
}
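
The lift measure used in the entry above to flag negatively correlated resource-allocation rules has a compact definition: lift(A -> B) = support(A and B) / (support(A) x support(B)), with values below 1 indicating negative correlation. A minimal Python sketch follows; the set-valued transaction format and the toy log are illustrative assumptions, not the paper's data model.

# Sketch of the lift measure for association rules mined from an event log.
# Transactions are modelled here as sets of items (task names, resource names).
def lift(transactions, antecedent, consequent):
    n = len(transactions)
    p_a = sum(antecedent <= t for t in transactions) / n
    p_c = sum(consequent <= t for t in transactions) / n
    p_both = sum((antecedent | consequent) <= t for t in transactions) / n
    return p_both / (p_a * p_c) if p_a and p_c else 0.0

# Toy log: lift < 1 suggests "review" and "alice" are negatively correlated.
log = [{"review", "bob"}, {"review", "alice"}, {"deploy", "alice"}]
print(lift(log, {"review"}, {"alice"}))  # 0.75
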
@article{Tsai201267,
  title = {Determinants of intangible assets value: The data mining approach },
  journal = {Knowledge-Based Systems },
  volume = {31},
  number = {0},
  pages = {67 - 77},
  year = {2012},
  note = {},
  issn = {0950-7051},
  doi = {http://dx.doi.org/10.1016/j.knosys.2012.02.007},
  url = {http://www.sciencedirect.com/science/article/pii/S0950705112000470},
  author = {Chih-Fong Tsai and Yu-Hsin Lu and David C. Yen},
  keywords = {Feature selection},
  keywords = {Data mining},
  keywords = {Firm value},
  keywords = {Intangible assets value},
  keywords = {Neural networks },
  abstract = {It is very important for investors and creditors to understand the critical factors affecting a firm’s value before making decisions about investments and loans. Since the knowledge-based economy has evolved, the method for creating firm value has transferred from traditional physical assets to intangible knowledge. Therefore, valuation of intangible assets has become a widespread topic of interest in the future of the economy. This study takes advantage of feature selection, an important data-preprocessing step in data mining, to identify important and representative factors affecting intangible assets. Particularly, five feature selection methods are considered, which include principal component analysis (PCA), stepwise regression (STEPWISE), decision trees (DT), association rules (AR), and genetic algorithms (GA). In addition, multi-layer perceptron (MLP) neural networks are used as the prediction model in order to understand which features selected from these five methods can allow the prediction model to perform best. Based on the chosen dataset containing 61 variables, the experimental result shows that combining the results from multiple feature selection methods performs the best. GA ∩ STEPWISE, DT ∪ PCA, and the \{DT\} single feature selection method generate approximately 75% prediction accuracy, which select 26, 22, and 7 variables respectively. }
}
@article{Lee20128954,
  title = {An information fusion approach to integrate image annotation and text mining methods for geographic knowledge discovery },
  journal = {Expert Systems with Applications },
  volume = {39},
  number = {10},
  pages = {8954 - 8967},
  year = {2012},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2012.02.028},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417412002722},
  author = {Chung-Hong Lee and Shih-Hao Wang},
  keywords = {Geographic knowledge discovery},
  keywords = {Image annotation},
  keywords = {Text mining},
  keywords = {Information fusion},
  keywords = {Machine learning },
  abstract = {Due to the steady increase in the number of heterogeneous types of location information on the internet, it is hard to organize a complete overview of the geospatial information for knowledge acquisition tasks related to specific geographic locations. The text- and photo-type geographical datasets contain numerous location data, such as location-based tourism information, therefore defining high-dimensional spaces of attributes that are highly correlated. In this work, we utilized text- and photo-type location information with a novel information fusion approach that exploits effective image annotation and location-based text-mining approaches to enhance the identification of geographic locations and spatial cognition. In this paper, we describe our feature extraction methods for annotating images, and we utilize a text mining approach to analyze images and texts simultaneously, in order to carry out geospatial text mining and image classification tasks. Subsequently, photo-images and textual documents are projected onto a unified feature space, in order to generate a co-constructed semantic space for information fusion. Also, we employed text mining approaches to classify documents into various categories based upon their geospatial features, with the aim of discovering relationships between documents and geographical zones. The experimental results show that the proposed method can effectively enhance location-based knowledge discovery tasks. }
}
@incollection{TamaddoniNezhad2013225,
  title = {Chapter Four - Construction and Validation of Food Webs Using Logic-Based Machine Learning and Text Mining },
  editor = {Guy Woodward and David A. Bohan},
  booktitle = {Ecological Networks in an Agricultural World},
  publisher = {Academic Press},
  year = {2013},
  volume = {49},
  pages = {225 - 289},
  series = {Advances in Ecological Research },
  issn = {0065-2504},
  doi = {http://dx.doi.org/10.1016/B978-0-12-420002-9.00004-4},
  url = {http://www.sciencedirect.com/science/article/pii/B9780124200029000044},
  author = {Alireza Tamaddoni-Nezhad and Ghazal Afroozi Milani and Alan Raybould and Stephen Muggleton and David A. Bohan},
  keywords = {Network ecology},
  keywords = {Food web},
  keywords = {Machine learning},
  keywords = {Text mining},
  keywords = {Abductive/Inductive logic programming},
  keywords = {Functional food web },
  abstract = {Abstract Network ecology holds great promise as an approach to modelling and predicting the effects of agricultural management on ecosystem service provision, as it bridges the gap between community and ecosystem ecology. Unfortunately, trophic interactions between most species in agricultural farmland are not well characterised empirically, and only partial food webs are available for a few systems. Large agricultural datasets of the nodes (i.e., species) in the webs are now available, and if these can be enriched with information on the links between them then the current shortage of network data can potentially be overcome. We demonstrate that a logic-based machine learning method can be used to automatically assign interactions between nodes, thereby generating plausible and testable food webs from ecological census data. Many of the learned trophic links were corroborated by the literature: in particular, links ascribed with high probability by machine learning corresponded with those having multiple references in the literature. In some cases, previously unobserved but high probability links were suggested and subsequently confirmed by other research groups. We evaluate these food webs using a new cross-validation method and present new results on automatic corroboration of a large, complex food web. The simulated frequencies of trophic links were also correlated with the total number of literature ‘hits’ for these links from the automatic corroboration. Finally, we also show that a network constructed by learning trophic links between functional groups is at least as accurate as the species-based trophic network. }
}
@article{Lo2012743,
  title = {Mining quantified temporal rules: Formalism, algorithms, and evaluation },
  journal = {Science of Computer Programming },
  volume = {77},
  number = {6},
  pages = {743 - 759},
  year = {2012},
  note = {(1) Coordination 2009 (2) \{WCRE\} 2009 },
  issn = {0167-6423},
  doi = {http://dx.doi.org/10.1016/j.scico.2010.10.003},
  url = {http://www.sciencedirect.com/science/article/pii/S0167642310001875},
  author = {David Lo and G. Ramalingam and Venkatesh-Prasad Ranganath and Kapil Vaswani},
  keywords = {Specification mining},
  keywords = {Temporal rules},
  keywords = {Quantification},
  keywords = {Dynamic analysis},
  keywords = {Reverse engineering },
  abstract = {Libraries usually impose constraints on how clients should use them. Often these constraints are not well-documented. In this paper, we address the problem of recovering such constraints automatically, a problem referred to as specification mining. Given some client programs that use a given library, we identify constraints on the library usage that are (almost) satisfied by the given set of clients. The class of rules we target for mining combines simple binary temporal operators with state predicates (composed of equality constraints) and quantification. This is a simple yet expressive subclass of temporal properties (LTL formulae) that allows us to capture many common \{API\} usage rules. We focus on recovering rules from execution traces and apply classical data mining concepts to be robust against bugs (API usage rule violations) in clients. We present new algorithms for mining rules from execution traces. We show how a propositional rule mining algorithm can be generalized to treat quantification and state predicates in a unified way. Our approach enables the miner to be complete (i.e. , mine all rules within the targeted class that are satisfied by the given traces) while avoiding an exponential blowup. We have implemented these algorithms and used them to mine \{API\} usage rules for several Windows APIs. Our experiments show the efficiency and effectiveness of our approach. }
}
@article{Capozzoli20154324,
  title = {Fault detection analysis using data mining techniques for a cluster of smart office buildings },
  journal = {Expert Systems with Applications },
  volume = {42},
  number = {9},
  pages = {4324 - 4338},
  year = {2015},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2015.01.010},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417415000251},
  author = {Alfonso Capozzoli and Fiorella Lauro and Imran Khan},
  keywords = {Smart building},
  keywords = {\{ANN\}},
  keywords = {Pattern recognition},
  keywords = {Fault detection },
  abstract = {Abstract There is an increasing need for automated fault detection tools in buildings. The total energy request in buildings can be significantly reduced by detecting abnormal consumption effectively. Numerous models are used to tackle this problem, but either they are very complex and mostly applicable at the component level, or they cannot be adopted for different buildings and equipment. In this study a simplified approach to automatically detect anomalies in building energy consumption is presented, based on actual recorded data of active electrical power for lighting and total active electrical power of a cluster of eight buildings. The proposed methodology uses statistical pattern recognition techniques and artificial neural ensembling networks coupled with outlier detection methods for fault detection. The results show the usefulness of this data analysis approach in automatic fault detection by reducing the number of false anomalies. The method makes it possible to identify patterns of faults occurring in a cluster of buildings; in this way the energy consumption can be further optimized, also through the building management staff, by informing occupants of their energy usage and educating them to be proactive in their energy consumption. Finally, in the context of smart buildings, the common outliers detected in the cluster of buildings demonstrate that the management of a smart district can be operated with a whole-buildings-cluster approach. }
}
@article{Sunikka201210049,
  title = {Applying text-mining to personalization and customization research literature – Who, what and where? },
  journal = {Expert Systems with Applications },
  volume = {39},
  number = {11},
  pages = {10049 - 10058},
  year = {2012},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2012.02.042},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417412002862},
  author = {Anne Sunikka and Johanna Bragge},
  keywords = {Personalization},
  keywords = {Customization},
  keywords = {Mass customization},
  keywords = {Text-mining},
  keywords = {Research profiling},
  keywords = {Literature review },
  abstract = {Personalization and customization have numerous definitions that are sometimes used interchangeably in the literature. This study combines a text-mining approach for profiling personalization and customization research with a traditional literature review in order to distinguish the main characteristics of these two research streams. Research profiling with search words personalization and customization is conducted using the Web of Science literature database. The elements typical to the personalization and customization research are identified. Personalization research has a strong focus on technology and the internet; in addition to which it emphasizes customers’ needs and preferences as well as information collection for user modeling and recommender systems. Customization is an older research stream, and the main body of the research has focused on tangible products but has lately initiated research in service fields. Based on the insights gained from research profiling and literature review, this study suggests a new classification of concepts linked to personalization. }
}
@article{Zolbanin2015150,
  title = {Predicting overall survivability in comorbidity of cancers: A data mining approach },
  journal = {Decision Support Systems },
  volume = {74},
  number = {0},
  pages = {150 - 161},
  year = {2015},
  note = {},
  issn = {0167-9236},
  doi = {http://dx.doi.org/10.1016/j.dss.2015.04.003},
  url = {http://www.sciencedirect.com/science/article/pii/S0167923615000743},
  author = {Hamed Majidi Zolbanin and Dursun Delen and Amir Hassan Zadeh},
  keywords = {Medical decision making},
  keywords = {Comorbidity},
  keywords = {Concurrent diseases},
  keywords = {Concomitant diseases},
  keywords = {Predictive modeling},
  keywords = {Random forest },
  abstract = {Abstract Cancer and other chronic diseases have constituted (and will do so at an increasing pace) a significant portion of healthcare costs in the United States in recent years. Although prior research has shown that diagnostic and treatment recommendations might be altered based on the severity of comorbidities, chronic diseases are still being investigated in isolation from one another in most cases. To illustrate the significance of concurrent chronic diseases in the course of treatment, this study uses SEER's cancer data to create two comorbid data sets: one for breast and female genital cancers and another for prostate and urinal cancers. Several popular machine learning techniques are then applied to the resultant data sets to build predictive models. Comparison of the results shows that having more information about comorbid conditions of patients can improve models' predictive power, which in turn, can help practitioners make better diagnostic and treatment decisions. Therefore, proper identification, recording, and use of patients' comorbidity status can potentially lower treatment costs and ease the healthcare related economic challenges. }
}
@article{Haalboom2012969,
  title = {The intersection of corporate social responsibility guidelines and indigenous rights: Examining neoliberal governance of a proposed mining project in Suriname },
  journal = {Geoforum },
  volume = {43},
  number = {5},
  pages = {969 - 979},
  year = {2012},
  note = {},
  issn = {0016-7185},
  doi = {http://dx.doi.org/10.1016/j.geoforum.2012.06.003},
  url = {http://www.sciencedirect.com/science/article/pii/S001671851200108X},
  author = {Bethany Haalboom},
  keywords = {Corporate social responsibility},
  keywords = {Indigenous rights},
  keywords = {Mining},
  keywords = {Suriname },
  abstract = {With neoliberal reforms and the growth of multinational mining investment in developing countries, corporate social responsibility (CSR) has become notable (and debatable) for its potential to fill a social and environmental governance gap. As yet, there has been limited analytical attention paid to the political struggles and power dynamics that get reflected through specific \{CSR\} guidelines and their implementation in local contexts; this is particularly apparent with respect to the human rights dimension of CSR, and more specifically, indigenous rights. This study documents the debates, issues of accountability, and different interpretations of \{CSR\} between \{NGOs\} representing indigenous rights and a mining corporation. These debates focus on environmental impact assessments; indigenous rights to land; and the indigenous right to Free, Prior, and Informed Consent. These exchanges illustrate the socio-political, as well as economic, positioning of these actors, and the different agendas associated with their positions that determine issues of accountability and shape alternate interpretations of \{CSR\} guidelines. The outcomes of these debates also reflect the different degrees of power that these actors hold in such contexts, irrespective of the strength or validity of their arguments about CSR. This dialogue is thereby a lens into the more complex and contentious entanglements that emerge with \{CSR\} as a mode of governance, as it plays out ‘on the ground.’ These findings also reinforce questions regarding what we can expect of \{CSR\} as a mode of governance for addressing human rights issues with resource extraction projects, particularly within the constraints of overriding political and social structures. }
}
@article{Tang20105172,
  title = {Blended metrics for novel sentence mining },
  journal = {Expert Systems with Applications },
  volume = {37},
  number = {7},
  pages = {5172 - 5177},
  year = {2010},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2009.12.075},
  url = {http://www.sciencedirect.com/science/article/pii/S095741740901121X},
  author = {Wenyin Tang and Flora S. Tsai and Lihui Chen},
  keywords = {Novel sentence mining},
  keywords = {Text mining},
  keywords = {Novelty detection},
  keywords = {Cosine similarity},
  keywords = {New word count},
  keywords = {Blended metric },
  abstract = {With the abundance of raw text documents available on the internet, many articles contain redundant information. Novel sentence mining can discover novel, yet relevant, sentences given a specific topic defined by a user. In real-time novelty mining, an important issue is how to select a suitable novelty metric that quantitatively measures the novelty of a particular sentence. To utilize the merits of different metrics, a blended metric is proposed by combining both the cosine similarity and new word count metrics. The blended metric has been tested on \{TREC\} 2003 and \{TREC\} 2004 Novelty Track data. The experimental results show that the blended metric performs generally better on topics with different ratios of novelty, which is useful for real-time novelty mining in topics with varying degrees of novelty. }
}
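
A linear blend of the two scores is the most direct reading of such a metric. The sketch below assumes an equal-weight mixing parameter alpha and whitespace tokenization; both are illustrative assumptions, not the paper's exact formulation.

# Sketch of a blended novelty metric: a convex combination of
# cosine-similarity novelty and a new-word-count ratio (alpha assumed).
import math
from collections import Counter

def cosine_novelty(sentence, history):
    """1 minus the maximum cosine similarity to previously seen sentences."""
    def cos(a, b):
        va, vb = Counter(a.split()), Counter(b.split())
        dot = sum(va[w] * vb[w] for w in va)
        na = math.sqrt(sum(v * v for v in va.values()))
        nb = math.sqrt(sum(v * v for v in vb.values()))
        return dot / (na * nb) if na and nb else 0.0
    return 1.0 - max((cos(sentence, h) for h in history), default=0.0)

def new_word_novelty(sentence, seen_words):
    """Fraction of tokens never seen before."""
    words = sentence.split()
    return sum(w not in seen_words for w in words) / len(words) if words else 0.0

def blended_novelty(sentence, history, seen_words, alpha=0.5):
    return (alpha * cosine_novelty(sentence, history)
            + (1 - alpha) * new_word_novelty(sentence, seen_words))
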
@article{Lee20129534,
  title = {Design of convergent product concepts based on functionality: An association rule mining and decision tree approach },
  journal = {Expert Systems with Applications },
  volume = {39},
  number = {10},
  pages = {9534 - 9542},
  year = {2012},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2012.02.099},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417412003624},
  author = {Changyong Lee and Bomi Song and Yongtae Park},
  keywords = {New product development (NPD)},
  keywords = {Convergent product},
  keywords = {Concept design},
  keywords = {Online community information},
  keywords = {Text mining},
  keywords = {Association rule mining (ARM)},
  keywords = {Decision tree (DT) },
  abstract = {Recent trends in paradigms of digital convergence have accentuated the notions of convergent products that are formed by adding new functions to an existing base product. However, a lacuna still remains in the literature as to systematic design of convergent product concepts (CPCs) based on functionality. This study proposes a systematic approach to design of \{CPCs\} based on online community information using data mining techniques. At the heart of the suggested approach is the combined use of association rule mining (ARM) and decision tree (DT) for discovering the significant relationships among items and detecting the meaningful conditions of items. Specifically, the proposed approach is composed of four steps: data collection and transformation, definition of target functions, identification of critical product features, and specification of design details. Three maps – function co-preferences map, feature relations map, and concepts specification map – are developed to aid decision making in design of CPCs, structuring and visualizing design implications. A case of the portable multimedia player (PMP) is presented to illustrate the proposed approach. We believe that our approach can reduce uncertainty and risk involved in the concept design stage. }
}
@article{Liang2012916,
  title = {Learning the “Whys”: Discovering design rationale using text mining — An algorithm perspective },
  journal = {Computer-Aided Design },
  volume = {44},
  number = {10},
  pages = {916 - 930},
  year = {2012},
  note = {Fundamentals of Next Generation CAD/E Systems },
  issn = {0010-4485},
  doi = {http://dx.doi.org/10.1016/j.cad.2011.08.002},
  url = {http://www.sciencedirect.com/science/article/pii/S0010448511001904},
  author = {Yan Liang and Ying Liu and Chun Kit Kwong and Wing Bun Lee},
  keywords = {Design rationale},
  keywords = {Rationale representation},
  keywords = {Rationale discovery},
  keywords = {Text Mining},
  keywords = {Patent mining },
  abstract = {Collecting design rationale (DR) and making it available in a well-organized manner will better support product design, innovation and decision-making. Many \{DR\} systems have been developed to capture \{DR\} since the 1970s. However, the \{DR\} capture process is heavily human-involved. In addition, with the increasing amount of \{DR\} available in archived design documents, there is an acute need for a new computational approach that can capture \{DR\} from free textual content effectively. In our previous study, we proposed an \{ISAL\} (issue, solution and artifact layer) model for \{DR\} representation. In this paper, we focus on algorithm design to discover \{DR\} from design documents according to the \{ISAL\} modeling. For the issue layer of the \{ISAL\} model, we define a semantic sentence graph to model sentence relationships through language patterns. Based on this graph, we improve the manifold-ranking algorithm to extract issue-bearing sentences. To discover solution–reason bearing sentences for the solution layer, we propose building up two sentence graphs based on candidate solution-bearing and reason-bearing sentences respectively, and propagating information between them. For artifact information extraction, we propose two term relations, i.e. the positional term relation and the mutual term relation. Using these relations, we extend our document profile model to score the candidate terms. The performance and scalability of the proposed algorithms are tested on patents as research data, joined with an example of prior art search to illustrate the approach’s application prospects. }
}
@article{He2012441,
  title = {Screening for posttraumatic stress disorder using verbal features in self narratives: A text mining approach },
  journal = {Psychiatry Research },
  volume = {198},
  number = {3},
  pages = {441 - 447},
  year = {2012},
  note = {},
  issn = {0165-1781},
  doi = {http://dx.doi.org/10.1016/j.psychres.2012.01.032},
  url = {http://www.sciencedirect.com/science/article/pii/S0165178112000625},
  author = {Qiwei He and Bernard P. Veldkamp and Theo de Vries},
  keywords = {Posttraumatic stress disorder},
  keywords = {Text mining},
  keywords = {Self narratives},
  keywords = {Text classification},
  keywords = {Screening },
  abstract = {Much evidence has shown that people's physical and mental health can be predicted by the words they use. However, such verbal information is seldom used in the screening and diagnosis process probably because the procedure to handle these words is rather difficult with traditional quantitative methods. The first challenge would be to extract robust information from diversified expression patterns, the second to transform unstructured text into a structuralized dataset. The present study developed a new textual assessment method to screen the posttraumatic stress disorder (PTSD) patients using lexical features in the self narratives with text mining techniques. Using 300 self narratives collected online, we extracted highly discriminative keywords with the Chi-square algorithm and constructed a textual assessment model to classify individuals with the presence or absence of PTSD. This resulted in a high agreement between computer and psychiatrists' diagnoses for \{PTSD\} and revealed some expressive characteristics in the writings of \{PTSD\} patients. Although the results of text analysis are not completely analogous to the results of structured interviews in \{PTSD\} diagnosis, the application of text mining is a promising addition to assessing \{PTSD\} in clinical and research settings. }
}
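
The Chi-square keyword-selection step described in the entry above can be sketched in a few lines with scikit-learn's chi2 scorer; the library choice and the toy narratives below are assumptions for illustration, not the study's actual pipeline or data.

# Sketch: rank words by their chi-square association with the class label,
# as a stand-in for the paper's keyword-extraction step (toy data).
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import chi2

narratives = [
    "I keep reliving the crash every night and cannot sleep",
    "We hiked along the coast and the weather was calm",
]
labels = [1, 0]  # 1 = screened positive, 0 = screened negative

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(narratives)
scores, _ = chi2(X, labels)
top = np.argsort(scores)[::-1][:5]
print([vectorizer.get_feature_names_out()[i] for i in top])  # most discriminative words
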
@article{Shang20122195,
  title = {Using Pig as a data preparation language for large-scale mining software repositories studies: An experience report },
  journal = {Journal of Systems and Software },
  volume = {85},
  number = {10},
  pages = {2195 - 2204},
  year = {2012},
  note = {Automated Software Evolution },
  issn = {0164-1212},
  doi = {http://dx.doi.org/10.1016/j.jss.2011.07.034},
  url = {http://www.sciencedirect.com/science/article/pii/S0164121211002007},
  author = {Weiyi Shang and Bram Adams and Ahmed E. Hassan},
  keywords = {Software engineering},
  keywords = {Mining Software Repositories},
  keywords = {Pig},
  keywords = {MapReduce },
  abstract = {The Mining Software Repositories (MSR) field analyzes software repository data to uncover knowledge and assist development of ever growing, complex systems. However, existing approaches and platforms for \{MSR\} analysis face many challenges when performing large-scale \{MSR\} studies. Such approaches and platforms rarely scale easily out of the box. Instead, they often require custom scaling tricks and designs that are costly to maintain and that are not reusable for other types of analysis. We believe that the web community has faced many of these software engineering scaling challenges before, as web analyses have to cope with the enormous growth of web data. In this paper, we report on our experience in using a web-scale platform (i.e., Pig) as a data preparation language to aid large-scale \{MSR\} studies. Through three case studies, we carefully validate the use of this web platform to prepare (i.e., Extract, Transform, and Load, ETL) data for further analysis. Despite several limitations, we still encourage \{MSR\} researchers to leverage Pig in their large-scale studies because of Pig's scalability and flexibility. Our experience report will help other researchers who want to scale their analyses. }
}
@article{Schoor20121321,
  title = {Exploring regulatory processes during a computer-supported collaborative learning task using process mining },
  journal = {Computers in Human Behavior },
  volume = {28},
  number = {4},
  pages = {1321 - 1331},
  year = {2012},
  note = {},
  issn = {0747-5632},
  doi = {http://dx.doi.org/10.1016/j.chb.2012.02.016},
  url = {http://www.sciencedirect.com/science/article/pii/S0747563212000581},
  author = {Cornelia Schoor and Maria Bannert},
  keywords = {Computer-supported collaborative learning},
  keywords = {Social regulation},
  keywords = {Research methods},
  keywords = {Self-regulated learning},
  keywords = {Process mining },
  abstract = {The purpose of this study was to explore sequences of social regulatory processes during a computer-supported collaborative learning task and their relationship to group performance. Analogous to self-regulation during individual learning, we conceptualized social regulation both as individual and as collaborative activities of analyzing, planning, monitoring and evaluating cognitive and motivational aspects during collaborative learning. We analyzed the data of 42 participants working together in dyads. They had 90 min to develop a common handout on a statistical topic while communicating only via chat and common editor. The log files of chat and editor were coded regarding activities of social regulation. Results show that participants in dyads with higher group performance (N = 20) did not differ from participants with lower group performance (N = 22) in the frequencies of regulatory activities. In an exploratory way, we used process mining to identify process patterns for high versus low group performance dyads. The resulting models show clear parallels between high and low achieving dyads in a double loop of working on the task, monitoring, and coordinating. Moreover, there are no major differences in the process of high versus low achieving dyads. Both results are discussed with regard to theoretical and empirical issues. Furthermore, the method of process mining is discussed. }
}
@article{Stevanovic20128707,
  title = {Feature evaluation for web crawler detection with data mining techniques },
  journal = {Expert Systems with Applications },
  volume = {39},
  number = {10},
  pages = {8707 - 8717},
  year = {2012},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2012.01.210},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417412002382},
  author = {Dusan Stevanovic and Aijun An and Natalija Vlajic},
  keywords = {Web crawler detection},
  keywords = {Web server access logs},
  keywords = {Data mining},
  keywords = {Classification},
  keywords = {\{DDoS\}},
  keywords = {\{WEKA\} },
  abstract = {Distributed Denial of Service (DDoS) is one of the most damaging attacks on the Internet security today. Recently, malicious web crawlers have been used to execute automated \{DDoS\} attacks on web sites across the WWW. In this study we examine the effect of applying seven well-established data mining classification algorithms on static web server access logs in order to: (1) classify user sessions as belonging to either automated web crawlers or human visitors and (2) identify which of the automated web crawlers sessions exhibit ‘malicious’ behavior and are potentially participants in a \{DDoS\} attack. The classification performance is evaluated in terms of classification accuracy, recall, precision and \{F1\} score. Seven out of nine vector (i.e. web-session) features employed in our work are borrowed from earlier studies on classification of user sessions as belonging to web crawlers. However, we also introduce two novel web-session features: the consecutive sequential request ratio and standard deviation of page request depth. The effectiveness of the new features is evaluated in terms of the information gain and gain ratio metrics. The experimental results demonstrate the potential of the new features to improve the accuracy of data mining classifiers in identifying malicious and well-behaved web crawler sessions. }
}
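
Information gain, one of the two feature-ranking metrics mentioned above, reduces to an entropy difference: IG(F) = H(class) - H(class | F). A minimal sketch follows, assuming discretized (here binary) feature values; WEKA's own implementation discretizes numeric attributes first.

# Sketch of information gain for ranking web-session features.
import math

def entropy(labels):
    n = len(labels)
    return -sum((c / n) * math.log2(c / n)
                for c in (labels.count(v) for v in set(labels)) if c)

def information_gain(feature_values, labels):
    conditional = 0.0
    for v in set(feature_values):
        subset = [l for f, l in zip(feature_values, labels) if f == v]
        conditional += len(subset) / len(labels) * entropy(subset)
    return entropy(labels) - conditional

# Toy check: a feature that perfectly separates crawlers from humans.
print(information_gain([1, 1, 0, 0], ["crawler", "crawler", "human", "human"]))  # 1.0
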
@article{Rong2012731,
  title = {A behavioral analysis of web sharers and browsers in Hong Kong using targeted association rule mining },
  journal = {Tourism Management },
  volume = {33},
  number = {4},
  pages = {731 - 740},
  year = {2012},
  note = {},
  issn = {0261-5177},
  doi = {http://dx.doi.org/10.1016/j.tourman.2011.08.006},
  url = {http://www.sciencedirect.com/science/article/pii/S0261517711001592},
  author = {Jia Rong and Huy Quan Vu and Rob Law and Gang Li},
  keywords = {Sharers},
  keywords = {Browsers},
  keywords = {Electronic word-of-mouth},
  keywords = {Association rules},
  keywords = {Machine learning},
  keywords = {Data mining},
  keywords = {Hong Kong},
  keywords = {Outbound tourism },
  abstract = {With the widespread use of Internet technology, electronic word-of-mouth [eWOM] communication through online reviews of products and services has a strong influence on consumer behavior and preferences. Although prior research efforts have attempted to investigate the behavior of users regarding the sharing of personal experiences and browsing the experiences of others online, it remains a challenge for business managers to incorporate eWOM effects into their business planning and decision-making processes effectively. Applying a newly proposed association rule mining technique, this study investigates eWOM in the context of the tourism industry using an outbound domestic tourism data set that was recently collected in Hong Kong. The complete profiles and the relations of online experience sharers and travel website browsers are explored. The empirical results are useful in helping tourism managers to define new target customers and to plan more effective marketing strategies. }
}
@article{Sobkowicz2012470,
  title = {Opinion mining in social media: Modeling, simulating, and forecasting political opinions in the web },
  journal = {Government Information Quarterly },
  volume = {29},
  number = {4},
  pages = {470 - 479},
  year = {2012},
  note = {Social Media in Government - Selections from the 12th Annual International Conference on Digital Government Research (dg.o2011) },
  issn = {0740-624X},
  doi = {http://dx.doi.org/10.1016/j.giq.2012.06.005},
  url = {http://www.sciencedirect.com/science/article/pii/S0740624X12000901},
  author = {Pawel Sobkowicz and Michael Kaschesky and Guillaume Bouchard},
  keywords = {Management},
  keywords = {Measurement},
  keywords = {Design},
  keywords = {Experimentation},
  keywords = {Opinion mining},
  keywords = {Social media},
  keywords = {Policy modeling },
  abstract = {Affordable and ubiquitous online communications (social media) provide the means for flows of ideas and opinions and play an increasing role for the transformation and cohesion of society – yet little is understood about how online opinions emerge, diffuse, and gain momentum. To address this problem, an opinion formation framework based on content analysis of social media and sociophysical system modeling is proposed. Based on prior research and the authors' own projects, three building blocks of online opinion tracking and simulation are described: (1) automated topic, emotion and opinion detection in real-time, (2) information flow modeling and agent-based simulation, and (3) modeling of opinion networks, including special social and psychological circumstances such as the influence of emotions, media and leaders, and changing social networks. Finally, three application scenarios are presented to illustrate the framework and motivate further research. }
}
@article{Tsai20106968,
  title = {Mining top-k frequent closed itemsets over data streams using the sliding window model },
  journal = {Expert Systems with Applications },
  volume = {37},
  number = {10},
  pages = {6968 - 6973},
  year = {2010},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2010.03.023},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417410002046},
  author = {Pauray S.M. Tsai},
  keywords = {Data mining},
  keywords = {Data stream},
  keywords = {Association rule},
  keywords = {Frequent closed itemset},
  keywords = {Sliding window },
  abstract = {Association rule mining is an important research topic in the data mining community. Two difficulties arise in mining association rules. First, the user must specify a minimum support for mining. Typically, the minimum support value must be tuned many times before a set of useful association rules can be obtained, and it is not easy for the user to find an appropriate value. Second, the mining result usually contains a large number of frequent itemsets, which in turn generate a large number of association rules and make the result difficult to apply. In this paper, we consider mining top-k frequent closed itemsets from data streams using a sliding window technique. A single-pass algorithm, called FCI_max, is developed for the generation of top-k frequent closed itemsets of length no more than max_l. Our method efficiently resolves these two difficulties in association rule mining, which promotes the usability of the mining result in practice. }
}
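Note: for readers new to the task in Tsai20106968, the following Python sketch spells out what "top-k frequent closed itemsets of length no more than max_l over a sliding window" means. It enumerates itemsets exhaustively, which is practical only for tiny windows; the paper's single-pass FCI_max algorithm and its data structures are not reproduced here.

from collections import deque
from itertools import combinations

def topk_closed(window, k, max_l):
    # Count the support of every itemset of length <= max_l in the window.
    counts = {}
    for txn in window:
        items = sorted(set(txn))
        for r in range(1, min(max_l, len(items)) + 1):
            for iset in combinations(items, r):
                counts[iset] = counts.get(iset, 0) + 1
    # An itemset is closed if no proper superset has the same support.
    closed = [(iset, sup) for iset, sup in counts.items()
              if not any(set(iset) < set(other) and counts[other] == sup
                         for other in counts)]
    closed.sort(key=lambda pair: -pair[1])
    return closed[:k]

window = deque(maxlen=4)  # the sliding window keeps the 4 newest transactions
for txn in [["a", "b"], ["a", "b", "c"], ["a", "c"], ["b", "c"], ["a", "b"]]:
    window.append(txn)    # appending beyond maxlen evicts the oldest transaction
print(topk_closed(list(window), k=3, max_l=2))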
@article{Şen20129468,
  title = {Predicting and analyzing secondary education placement-test scores: A data mining approach },
  journal = {Expert Systems with Applications },
  volume = {39},
  number = {10},
  pages = {9468 - 9476},
  year = {2012},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2012.02.112},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417412003752},
  author = {Baha Şen and Emine Uçar and Dursun Delen},
  keywords = {Data mining},
  keywords = {Classification},
  keywords = {Prediction},
  keywords = {Sensitivity analysis},
  keywords = {\{SETS\} },
  abstract = {Understanding the factors that lead to success (or failure) of students at placement tests is an interesting and challenging problem. Since the centralized placement tests and future academic achievements are considered to be related concepts, analysis of the success factors behind placement tests may help understand and potentially improve academic achievement. In this study, using a large and feature-rich dataset from the Secondary Education Transition System in Turkey, we developed models to predict secondary education placement test results, and using sensitivity analysis on those prediction models we identified the most important predictors. The results showed that the \{C5\} decision tree algorithm is the best predictor with 95% accuracy on the hold-out sample, followed by support vector machines (with an accuracy of 91%) and artificial neural networks (with an accuracy of 89%). Logistic regression models came out to be the least accurate of the four, with an overall accuracy of 82%. The sensitivity analysis revealed that previous test experience, whether a student has a scholarship, the student’s number of siblings, and previous years’ grade point average are among the most important predictors of the placement test scores. }
}
@article{Chang20122183,
  title = {An efficient algorithm of frequent \{XML\} query pattern mining for ebXML applications in e-commerce },
  journal = {Expert Systems with Applications },
  volume = {39},
  number = {2},
  pages = {2183 - 2193},
  year = {2012},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2011.07.011},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417411009730},
  author = {Tsui-Ping Chang and Shih-Ying Chen},
  keywords = {\{XML\} query pattern mining},
  keywords = {\{XML\} query},
  keywords = {Data mining},
  keywords = {ebXML},
  keywords = {E-commerce },
  abstract = {Providing efficient querying of \{XML\} data for ebXML applications in e-commerce is crucial, as \{XML\} has become the most important technique for exchanging data over the Internet. ebXML is a set of specifications for companies to exchange their data in e-commerce. Following the ebXML specifications, companies have a standard method to exchange business messages and communicate data and business rules in e-commerce. Due to its tree-structure paradigm, \{XML\} is superior in its capability of storing and querying complex data for ebXML applications. Therefore, discovering frequent \{XML\} query patterns has become an interesting topic for \{XML\} data management in ebXML applications. In this paper, we present an efficient mining algorithm, namely ebXMiner, to discover the frequent \{XML\} query patterns for ebXML applications. Unlike the existing algorithms, we propose a new idea of collecting the equivalent \{XML\} queries and then enumerating the candidates from infrequent \{XML\} queries in our ebXMiner. Furthermore, our simulation results show that ebXMiner outperforms other algorithms in its execution time. }
}
@article{Javadi2012926,
  title = {Modelling stress–strain and volume change behaviour of unsaturated soils using an evolutionary based data mining technique, an incremental approach },
  journal = {Engineering Applications of Artificial Intelligence },
  volume = {25},
  number = {5},
  pages = {926 - 933},
  year = {2012},
  note = {},
  issn = {0952-1976},
  doi = {http://dx.doi.org/10.1016/j.engappai.2012.03.006},
  url = {http://www.sciencedirect.com/science/article/pii/S0952197612000620},
  author = {A.A. Javadi and A. Ahangar-Asr and A. Johari and A. Faramarzi and D. Toll},
  keywords = {Unsaturated soil},
  keywords = {Evolutionary computation},
  keywords = {Data mining },
  abstract = {Modelling of unsaturated soils has been the subject of many research works in the past few decades. A number of constitutive models have been developed to describe the complex behaviour of unsaturated soils. However, many have proven to be unable to predict all aspects of the behaviour of unsaturated soils in a unified manner. In this paper an alternative new approach is presented, based on the Evolutionary Polynomial Regression (EPR) technique. \{EPR\} is a data mining technique that generates a transparent and structured representation of the behaviour of a system directly from input test data. The capabilities of the proposed EPR-based framework in modelling the behaviour of unsaturated soils are illustrated using results from a comprehensive set of triaxial tests on samples of compacted unsaturated soils from the literature. The main parameters used for modelling the behaviour of unsaturated soils during shearing are initial water content, initial dry density, mean net stress, axial strain, suction, volumetric strain, and deviator stress. The model developed is used to predict different aspects of the behaviour of unsaturated soils for conditions not used in the model building process. The results show that the proposed approach provides a useful framework for modelling of unsaturated soils. The merits and advantages of the proposed approach are highlighted. }
}
@article{Rebuge201299,
  title = {Business process analysis in healthcare environments: A methodology based on process mining },
  journal = {Information Systems },
  volume = {37},
  number = {2},
  pages = {99 - 116},
  year = {2012},
  note = {Management and Engineering of Process-Aware Information Systems },
  issn = {0306-4379},
  doi = {http://dx.doi.org/10.1016/j.is.2011.01.003},
  url = {http://www.sciencedirect.com/science/article/pii/S0306437911000044},
  author = {Álvaro Rebuge and Diogo R. Ferreira},
  keywords = {Business process analysis},
  keywords = {Healthcare processes},
  keywords = {Process mining},
  keywords = {Sequence clustering },
  abstract = {Performing business process analysis in healthcare organizations is particularly difficult due to the highly dynamic, complex, ad hoc, and multi-disciplinary nature of healthcare processes. Process mining is a promising approach to obtain a better understanding about those processes by analyzing event data recorded in healthcare information systems. However, not all process mining techniques perform well in capturing the complex and ad hoc nature of clinical workflows. In this work we introduce a methodology for the application of process mining techniques that leads to the identification of regular behavior, process variants, and exceptional medical cases. The approach is demonstrated in a case study conducted at a hospital emergency service. For this purpose, we implemented the methodology in a tool that integrates the main stages of process analysis. The tool is specific to the case study, but the same methodology can be used in other healthcare environments. }
}
@article{Demiriz2011284,
  title = {Re-mining item associations: Methodology and a case study in apparel retailing },
  journal = {Decision Support Systems },
  volume = {52},
  number = {1},
  pages = {284 - 293},
  year = {2011},
  note = {},
  issn = {0167-9236},
  doi = {http://dx.doi.org/10.1016/j.dss.2011.08.004},
  url = {http://www.sciencedirect.com/science/article/pii/S016792361100131X},
  author = {Ayhan Demiriz and Gürdal Ertek and Tankut Atan and Ufuk Kula},
  keywords = {Data mining},
  keywords = {Association mining},
  keywords = {Negative association},
  keywords = {Apparel retailing},
  keywords = {Inductive decision trees},
  keywords = {Retail data },
  abstract = {Association mining is the conventional data mining technique for analyzing market basket data and it reveals the positive and negative associations between items. While being an integral part of transaction data, pricing and time information have not been integrated into market basket analysis in earlier studies. This paper proposes a new approach to mine price, time and domain related attributes through re-mining of association mining results. The underlying factors behind positive and negative relationships can be characterized and described through this second data mining stage. The applicability of the methodology is demonstrated through the analysis of data coming from a large apparel retail chain, and its algorithmic complexity is analyzed in comparison to the existing techniques. }
}
@article{Suh2015115,
  title = {Forecasting the daily outbreak of topic-level political risk from social media using hidden Markov model-based techniques },
  journal = {Technological Forecasting and Social Change },
  volume = {94},
  number = {0},
  pages = {115 - 132},
  year = {2015},
  note = {},
  issn = {0040-1625},
  doi = {http://dx.doi.org/10.1016/j.techfore.2014.08.014},
  url = {http://www.sciencedirect.com/science/article/pii/S0040162514002534},
  author = {Jong Hwan Suh},
  keywords = {Political risk},
  keywords = {Social media},
  keywords = {Topic extraction},
  keywords = {Sentiment},
  keywords = {Social network},
  keywords = {Hidden Markov model},
  keywords = {Markov switching model },
  abstract = {Nowadays, as an arena of politics, social media can ignite political protests, so analyzing topics discussed negatively in social media has become increasingly important for detecting a nation’s political risk. In this context, this paper designs and examines an automatic approach to forecast the daily outbreak of political risk from social media at a topic level. It evaluates the forecasting performance of topic features (investigated among previous works that analyze social media data for politics), of hidden Markov model (HMM)-based techniques (widely used for anomaly detection with time-series data), and of detection models that combine the topic features and the detection techniques. When applied to South Korea’s Web forum, Daum Agora, statistical comparisons under the constraints of a false positive rate of < 0.1 and timeliness of < 0 show that the social network-based feature gives the best results for accuracy and the energy-based feature for sensitivity, but there is no single best detection technique for both accuracy and sensitivity. Moreover, they demonstrate that the detection model using the Markov switching model with jumps (MSJ) with the social network-based feature is the best combination for accuracy; there is no single best detection model for sensitivity. This paper helps take a step toward preventing national political risk, so that predictive governance can ultimately benefit the people. }
}
@article{Venugopalan2015236,
  title = {Topic based classification and pattern identification in patents },
  journal = {Technological Forecasting and Social Change },
  volume = {94},
  number = {0},
  pages = {236 - 250},
  year = {2015},
  note = {},
  issn = {0040-1625},
  doi = {http://dx.doi.org/10.1016/j.techfore.2014.10.006},
  url = {http://www.sciencedirect.com/science/article/pii/S0040162514002923},
  author = {Subhashini Venugopalan and Varun Rai},
  keywords = {Patents},
  keywords = {Topic modeling},
  keywords = {Document classification},
  keywords = {Technology convergence},
  keywords = {Solar photovoltaic},
  keywords = {Knowledge flows },
  abstract = {Patent classification systems and citation networks are used extensively in innovation studies. However, non-unique mapping of classification codes onto specific products/markets and the difficulties in accurately capturing knowledge flows based just on citation linkages present limitations to these conventional patent analysis approaches. We present a natural language processing based hierarchical technique that enables the automatic identification and classification of patent datasets into technology areas and sub-areas. The key novelty of our technique is to use topic modeling to map patents to probability distributions over real world categories/topics. Accuracy and usefulness of our technique are tested on a dataset of 10,201 patents in solar photovoltaics filed in the United States Patent and Trademark Office (USPTO) between 2002 and 2013. We show that linguistic features from topic models can be used to effectively identify the main technology area that a patent's invention applies to. Our computational experiments support the view that the topic distribution of a patent offers a reduced-form representation of the knowledge content in a patent. Accordingly, we suggest that this hidden thematic structure in patents can be useful in studies of the policy–innovation–geography nexus. To that end, we also demonstrate an application of our technique for identifying patterns in technological convergence. }
}
@article{Deng20124453,
  title = {Fast mining erasable itemsets using NC_sets },
  journal = {Expert Systems with Applications },
  volume = {39},
  number = {4},
  pages = {4453 - 4463},
  year = {2012},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2011.09.143},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417411014667},
  author = {Zhi-Hong Deng and Xiao-Ran Xu},
  keywords = {Data mining},
  keywords = {Erasable itemsets},
  keywords = {NC_sets},
  keywords = {Algorithms},
  keywords = {Data structure },
  abstract = {Mining erasable itemsets, first introduced in 2009, is one of the new emerging data mining tasks. In this paper, we present a new data representation called NC_set, which keeps track of the complete information used for mining erasable itemsets. Based on NC_set, we propose a new algorithm called \{MERIT\} for mining erasable itemsets efficiently. The efficiency of \{MERIT\} is achieved with the following three techniques. First, the NC_set is a compact structure, which prunes irrelevant data automatically. Second, the computation of the gain of an itemset is transformed into the combination of NC_sets, which can be completed in linear time complexity by an ingenious strategy. Third, \{MERIT\} can directly find erasable itemsets without generating candidate itemsets in some cases. For evaluating MERIT, we have conducted extensive experiments on a number of synthetic product databases. Our performance study shows that \{MERIT\} is efficient and is on average about two orders of magnitude faster than META, the first algorithm for mining erasable itemsets. }
}
@article{Eirinaki20121175,
  title = {Feature-based opinion mining and ranking },
  journal = {Journal of Computer and System Sciences },
  volume = {78},
  number = {4},
  pages = {1175 - 1184},
  year = {2012},
  note = {},
  issn = {0022-0000},
  doi = {http://dx.doi.org/10.1016/j.jcss.2011.10.007},
  url = {http://www.sciencedirect.com/science/article/pii/S0022000011001139},
  author = {Magdalini Eirinaki and Shamita Pisal and Japinder Singh},
  keywords = {Opinion mining},
  keywords = {Feature ranking},
  keywords = {Sentiment analysis},
  keywords = {Semantic orientation},
  keywords = {Search engine },
  abstract = {The proliferation of blogs and social networks presents a new set of challenges and opportunities in the way information is searched and retrieved. Even though facts still play a very important role when information is sought on a topic, opinions have become increasingly important as well. Opinions expressed in blogs and social networks are playing an important role in influencing everything from the products people buy to the presidential candidate they support. Thus, there is a need for a new type of search engine which will not only retrieve facts, but will also enable the retrieval of opinions. Such a search engine can be used in a number of diverse applications, from product review analysis to aggregating opinions on a political candidate or issue. Enterprises can also use such an engine to determine how users perceive their products and how they stand with respect to the competition. This paper presents an algorithm which not only analyzes the overall sentiment of a document/review, but also identifies the semantic orientation of specific components of the review that lead to a particular sentiment. The algorithm is integrated in an opinion search engine which presents results to a query along with their overall tone and a summary of sentiments of the most important features. }
}
@article{Huang20121068,
  title = {Mining the change of customer behavior in fuzzy time-interval sequential patterns },
  journal = {Applied Soft Computing },
  volume = {12},
  number = {3},
  pages = {1068 - 1086},
  year = {2012},
  note = {},
  issn = {1568-4946},
  doi = {http://dx.doi.org/10.1016/j.asoc.2011.11.017},
  url = {http://www.sciencedirect.com/science/article/pii/S1568494611004595},
  author = {Tony Cheng-Kui Huang},
  keywords = {Data mining},
  keywords = {Change mining},
  keywords = {Fuzzy sets},
  keywords = {Sequential patterns},
  keywords = {Time-interval },
  abstract = {Comprehending changes of customer behavior is an essential problem that must be faced for survival in a fast-changing business environment. Particularly in the management of electronic commerce (EC), many companies have developed on-line shopping stores to serve customers and immediately collect buying logs in databases. This trend has led to the development of data-mining applications. Fuzzy time-interval sequential pattern mining is one type of serviceable data-mining technique that discovers customer behavioral patterns over time. To take a shopping example, (Bread, Short, Milk, Long, Jam) means that Bread is bought before Milk within a Short period, and Jam is bought after Milk within a Long period, where Short and Long are predetermined linguistic terms given by managers. The information shown in this example reveals more general and concise knowledge for managers, allowing them to make quick-response decisions, especially in business. However, no studies, to our knowledge, have yet addressed the issue of changes in fuzzy time-interval sequential patterns. The fuzzy time-interval sequential pattern (Bread, Short, Milk, Long, Jam) may have held last year but may no longer be a trend this year, having been substituted by (Bread, Short, Yogurt, Short, Jam). Without updating this knowledge, managers might map out inappropriate marketing plans for products or services and dated inventory strategies with respect to time-intervals. To deal with this problem, we propose a novel change mining model, MineFuzzChange, to detect changes in fuzzy time-interval sequential patterns. Using a brick-and-mortar transactional dataset collected from a retail chain in Taiwan and a \{B2C\} \{EC\} dataset, experiments are carried out to evaluate the proposed model. We empirically demonstrate how the model helps managers to understand the changing behaviors of their customers and to formulate timely marketing and inventory strategies. }
}
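Note: the linguistic terms in Huang20121068 (Short, Long, etc.) are fuzzy sets over the time gap between purchases. The membership functions below are invented for illustration (in the paper they are predetermined by managers), so the breakpoints are assumptions; the sketch only shows how a numeric gap is fuzzified into such terms.

def triangular(x, a, b, c):
    # Triangular membership function: rises from a, peaks at b, falls to c.
    if x <= a or x >= c:
        return 0.0
    return (x - a) / (b - a) if x <= b else (c - x) / (c - b)

def right_shoulder(x, a, b):
    # Membership 0 below a, 1 above b, linear in between.
    if x <= a:
        return 0.0
    return 1.0 if x >= b else (x - a) / (b - a)

def gap_memberships(days):
    # Hypothetical manager-defined terms for the gap between two purchases.
    return {
        "Short": triangular(days, -1, 0, 15),
        "Medium": triangular(days, 5, 20, 35),
        "Long": right_shoulder(days, 25, 40),
    }

print(gap_memberships(3))  # a 3-day gap is strongly Short, not Medium or Long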
@article{Dejaeger2012548,
  title = {Gaining insight into student satisfaction using comprehensible data mining techniques },
  journal = {European Journal of Operational Research },
  volume = {218},
  number = {2},
  pages = {548 - 562},
  year = {2012},
  note = {},
  issn = {0377-2217},
  doi = {http://dx.doi.org/10.1016/j.ejor.2011.11.022},
  url = {http://www.sciencedirect.com/science/article/pii/S0377221711010137},
  author = {Karel Dejaeger and Frank Goethals and Antonio Giangreco and Lapo Mola and Bart Baesens},
  keywords = {Data mining},
  keywords = {Education evaluation},
  keywords = {Multi class classification},
  keywords = {Comprehensibility },
  abstract = {As a consequence of the heightened competition on the education market, the management of educational institutions often attempts to collect information on what drives student satisfaction by e.g. organizing large scale surveys amongst the student population. Until now, this source of potentially very valuable information remains largely untapped. In this study, we address this issue by investigating the applicability of different data mining techniques to identify the main drivers of student satisfaction in two business education institutions. In the end, the resulting models are to be used by the management to support the strategic decision making process. Hence, the aspect of model comprehensibility is considered to be at least equally important as model performance. It is found that data mining techniques are able to select a surprisingly small number of constructs that require attention in order to manage student satisfaction. }
}
@article{Szakonyi20151,
  title = {The KnownLeaf literature curation system captures knowledge about Arabidopsis leaf growth and development and facilitates integrated data mining },
  journal = {Current Plant Biology },
  volume = {2},
  number = {0},
  pages = {1 - 11},
  year = {2015},
  note = {},
  issn = {2214-6628},
  doi = {http://dx.doi.org/10.1016/j.cpb.2014.12.002},
  url = {http://www.sciencedirect.com/science/article/pii/S2214662815000031},
  author = {Dóra Szakonyi and Sofie Van Landeghem and Katja Baerenfaller and Lieven Baeyens and Jonas Blomme and Rubén Casanova-Sáez and Stefanie De Bodt and David Esteve-Bruna and Fabio Fiorani and Nathalie Gonzalez and Jesper Grønlund and Richard G.H. Immink and Sara Jover-Gil and Asuka Kuwabara and Tamara Muñoz-Nortes and Aalt D.J. van Dijk and David Wilson-Sánchez and Vicky Buchanan-Wollaston and Gerco C. Angenent and Yves Van de Peer and Dirk Inzé and José Luis Micol and Wilhelm Gruissem and Sean Walsh and Pierre Hilson},
  keywords = {Arabidopsis},
  keywords = {Leaf growth},
  keywords = {Literature curation},
  keywords = {Data integration },
  abstract = {The information that connects genotypes and phenotypes is essentially embedded in research articles written in natural language. To facilitate access to this knowledge, we constructed a framework for the curation of the scientific literature studying the molecular mechanisms that control leaf growth and development in Arabidopsis thaliana (Arabidopsis). Standard structured statements, called relations, were designed to capture diverse data types, including phenotypes and gene expression linked to genotype description, growth conditions, genetic and molecular interactions, and details about molecular entities. Relations were then annotated from the literature, defining the relevant terms according to standard biomedical ontologies. This curation process was supported by a dedicated graphical user interface, called Leaf Knowtator. A total of 283 primary research articles were curated by a community of annotators, yielding 9947 relations monitored for consistency and over 12,500 references to Arabidopsis genes. This information was converted into a relational database (KnownLeaf) and merged with other public Arabidopsis resources relative to transcriptional networks, protein–protein interaction, gene co-expression, and additional molecular annotations. Within KnownLeaf, leaf phenotype data can be searched together with molecular data originating either from this curation initiative or from external public resources. Finally, we built a network (LeafNet) with a portion of the KnownLeaf database content to graphically represent the leaf phenotype relations in a molecular context, offering an intuitive starting point for knowledge mining. Literature curation efforts such as ours provide high quality structured information accessible to computational analysis, and thereby to a wide range of applications. DATA: The presented work was performed in the framework of the AGRON-OMICS project (Arabidopsis \{GROwth\} Network integrating \{OMICS\} technologies) supported by the European Commission 6th Framework Programme (Grant number LSHG-CT-2006-037704). A data integration and data sharing portal collecting all the major results from the consortium is available at https://agronomics.ethz.ch/; all data presented in our paper can be found there. }
}
@article{Nohuddin2012104,
  title = {Finding “interesting” trends in social networks using frequent pattern mining and self organizing maps },
  journal = {Knowledge-Based Systems },
  volume = {29},
  number = {0},
  pages = {104 - 113},
  year = {2012},
  note = {Artificial Intelligence 2010 },
  issn = {0950-7051},
  doi = {http://dx.doi.org/10.1016/j.knosys.2011.07.003},
  url = {http://www.sciencedirect.com/science/article/pii/S0950705111001420},
  author = {Puteri N.E. Nohuddin and Frans Coenen and Rob Christley and Christian Setzkorn and Yogesh Patel and Shane Williams},
  keywords = {Trends},
  keywords = {Social networks},
  keywords = {Frequent pattern mining},
  keywords = {Self organizing maps},
  keywords = {Clustering },
  abstract = {This paper introduces a technique that uses frequent pattern mining and \{SOM\} techniques to identify, group and analyse trends in sequences of time stamped social networks so as to identify “interesting” trends. In this study, trends are defined in terms of a series of occurrence counts associated with frequent patterns that may be identified within social networks. Typically a large number of frequent patterns, and by extension a large number of trends, are discovered. Thus, to assist with the analysis of the discovered trends, the use of \{SOM\} techniques is advocated so that similar trends can be grouped together. To identify “interesting” trends, a sequence of \{SOMs\} is generated, which can be interpreted by considering how trends move from one \{SOM\} to the next. The further a trend moves from one \{SOM\} to the next, the more “interesting” the trend is deemed to be. The study is focused on two types of network, Star networks and Complex star networks, exemplified by two real applications: the Cattle Tracing System in operation in Great Britain and a car insurance quotation application. }
}
@article{Zhao201533,
  title = {Topic-centric and semantic-aware retrieval system for internet of things },
  journal = {Information Fusion },
  volume = {23},
  number = {0},
  pages = {33 - 42},
  year = {2015},
  note = {},
  issn = {1566-2535},
  doi = {http://dx.doi.org/10.1016/j.inffus.2014.01.001},
  url = {http://www.sciencedirect.com/science/article/pii/S1566253514000062},
  author = {Feng Zhao and Zheng Sun and Hai Jin},
  keywords = {Internet of things},
  keywords = {Information retrieval},
  keywords = {Knowledge network},
  keywords = {Topic centric},
  keywords = {Semantic awareness },
  abstract = {The Internet of things (IoT) has been considered as one of the promising paradigms that can allow people and objects to seamlessly interact. So far, numerous applications and services have been proposed, such as retrieval services. Retrieval, however, faces a big challenge in IoT because the data belongs to different domains and user interaction with the surrounding environment is constrained. This paper proposes Acrost, a retrieval system based on topic discovery and semantic awareness in the IoT environment. The initial contents with interesting information are obtained through the combination of two topic-centric collectors. The metadata is extracted by aggregating regular expression-based and conditional random fields-based approaches. Moreover, semantic-aware retrieval is achieved by parsing the query and ranking the relevance of contents. In addition, we present a case study on academic conference retrieval to validate the proposed approaches. Experimental results show that the proposed system can significantly improve the response time and efficiency of topic-adaptive retrieval. }
}
@article{Koyuncugil20126238,
  title = {Financial early warning system model and data mining application for risk detection },
  journal = {Expert Systems with Applications },
  volume = {39},
  number = {6},
  pages = {6238 - 6253},
  year = {2012},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2011.12.021},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417411016927},
  author = {Ali Serhan Koyuncugil and Nermin Ozgulbas},
  keywords = {\{CHAID\}},
  keywords = {Data mining},
  keywords = {Early warning systems},
  keywords = {Financial risk},
  keywords = {Financial distress},
  keywords = {\{SMEs\} },
  abstract = {One of the biggest problems of \{SMEs\} is their tendency toward financial distress because of an insufficient financial background. In this study, an early warning system (EWS) model based on data mining for financial risk detection is presented. The \{CHAID\} algorithm has been used for development of the EWS. With its automated nature, the developed \{EWS\} can serve as a tailor-made financial advisor in the decision making process of firms whose managers have an inadequate financial background. Besides, an application of the model was implemented covering 7853 \{SMEs\}, based on Turkish Central Bank (TCB) 2007 data. By using the \{EWS\} model, 31 risk profiles, 15 risk indicators, 2 early warning signals, and 4 financial road maps have been determined for financial risk mitigation. }
}
@article{Leong20122584,
  title = {Mining sentiments in \{SMS\} texts for teaching evaluation },
  journal = {Expert Systems with Applications },
  volume = {39},
  number = {3},
  pages = {2584 - 2589},
  year = {2012},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2011.08.113},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417411012449},
  author = {Chee Kian Leong and Yew Haur Lee and Wai Keong Mak},
  keywords = {Sentiment mining},
  keywords = {\{SMS\} texts},
  keywords = {Education },
  abstract = {This paper explores the potential application of sentiment mining for analyzing short message service (SMS) texts in teaching evaluation. Data preparation involves the reading, parsing and categorization of the \{SMS\} texts. Three models were developed: the base model, the “corrected” model which adjusts for spelling errors and the “sentiment” model which extends the “corrected” model by performing sentiment mining. An “interestingness” criterion selects the “sentiment” model from which the sentiments of the students towards the lecture are discerned. Two types of incomplete \{SMS\} texts are also identified and the implications of their removal for the analysis ascertained. }
}
@article{Huang2014331,
  title = {Burst topic discovery and trend tracing based on Storm },
  journal = {Physica A: Statistical Mechanics and its Applications },
  volume = {416},
  number = {0},
  pages = {331 - 339},
  year = {2014},
  note = {},
  issn = {0378-4371},
  doi = {http://dx.doi.org/10.1016/j.physa.2014.08.059},
  url = {http://www.sciencedirect.com/science/article/pii/S0378437114007444},
  author = {Shihang Huang and Ying Liu and Depeng Dang},
  keywords = {Non-homogeneous Poisson process},
  keywords = {Storm},
  keywords = {Burst topic},
  keywords = {Trend },
  abstract = {With the rapid development of the Internet and the promotion of the mobile Internet, microblogs have become a major source and route of transmission for public opinion, including burst topics that are caused by emergencies. To facilitate real time mining of a large range of burst topics, in this paper we propose a method to discover burst topics in real time and trace their trends based on the variation trends of word frequencies. First, for the variation trend of the words in microblogs, we adopt a non-homogeneous Poisson process model to fit the data. To represent the heat and trend of the words, we introduce a heat degree factor and a trend degree factor and realise the real time discovery and trend tracing of the burst topics based on these two factors. Second, to improve computing performance, this work is based on the Storm stream computing framework for real time computing. Finally, the experimental results indicate that by adjusting the observation window size and trend degree threshold, topics with different cycles and different burst strengths can be discovered. }
}
@article{Huang2012257,
  title = {An empirical investigation of factors influencing the adoption of data mining tools },
  journal = {International Journal of Information Management },
  volume = {32},
  number = {3},
  pages = {257 - 270},
  year = {2012},
  note = {},
  issn = {0268-4012},
  doi = {http://dx.doi.org/10.1016/j.ijinfomgt.2011.11.006},
  url = {http://www.sciencedirect.com/science/article/pii/S0268401211001319},
  author = {Tony Cheng-Kui Huang and Chuang-Chun Liu and Dong-Cheng Chang},
  keywords = {Technology acceptance model},
  keywords = {Behavioral intention},
  keywords = {Data mining tools},
  keywords = {Information systems },
  abstract = {Previous studies explored the adoption of various information technologies. However, there is little empirical research on factors influencing the adoption of data mining tools (DMTs), particularly at an individual level. This study investigates how users perceive and adopt \{DMTs\} to broaden practical knowledge for the business intelligence community. First, this study develops a theoretical model based on the Technology Acceptance Model 3, and then examines its perceived usefulness, perceived ease of use, and its ability to explain users’ intentions to use DMTs. The model's determinants include 4 categories: the task-oriented dimension (job relevance, output quality, result demonstrability, response time, and format), control beliefs (computer self-efficacy and perceptions of external control), emotion (computer anxiety), and intrinsic motivation (computer playfulness). This study also surveys the moderating effect of experience and output quality on the determinants of \{DMT\} adoption and use. An empirical study involving 206 \{DMT\} users was conducted to evaluate the model using structural equation modeling. Results demonstrate that the proposed model explains 58% of the variance. The findings of this study have interesting implications with respect to \{DMT\} adoption, both for researchers and practitioners. }
}
@article{Zhang2011667,
  title = {Multilingual sentence categorization and novelty mining },
  journal = {Information Processing & Management },
  volume = {47},
  number = {5},
  pages = {667 - 675},
  year = {2011},
  note = {Managing and Mining Multilingual Documents },
  issn = {0306-4573},
  doi = {http://dx.doi.org/10.1016/j.ipm.2010.02.003},
  url = {http://www.sciencedirect.com/science/article/pii/S0306457310000178},
  author = {Yi Zhang and Flora S. Tsai and Agus Trisnajaya Kwee},
  keywords = {Multilingual categorization},
  keywords = {Sentence retrieval},
  keywords = {Novelty mining},
  keywords = {Malay},
  keywords = {Chinese },
  abstract = {A challenge for sentence categorization and novelty mining is to detect not only when text is relevant to the user’s information need, but also when it contains something new which the user has not seen before. It involves two tasks that need to be solved. The first is identifying relevant sentences (categorization) and the second is identifying new information from those relevant sentences (novelty mining). Many previous studies of relevant sentence retrieval and novelty mining have been conducted on the English language, but few papers have addressed the problem of multilingual sentence categorization and novelty mining. This is an important issue in global business environments, where mining knowledge from text in a single language is not sufficient. In this paper, we perform the first task by categorizing Malay and Chinese sentences, then comparing their performances with that of English. Thereafter, we conduct novelty mining to identify the sentences with new information. Experimental results on \{TREC\} 2004 Novelty Track data show similar categorization performance on Malay and English sentences, which greatly outperform Chinese. In the second task, it is observed that we can achieve similar novelty mining results for all three languages, which indicates that our algorithm is suitable for novelty mining of multilingual sentences. In addition, after benchmarking our results with novelty mining without categorization, it is learnt that categorization is necessary for the successful performance of novelty mining. }
}
@article{FournierViger201263,
  title = {CMRules: Mining sequential rules common to several sequences },
  journal = {Knowledge-Based Systems },
  volume = {25},
  number = {1},
  pages = {63 - 76},
  year = {2012},
  note = {Special Issue on New Trends in Data Mining },
  issn = {0950-7051},
  doi = {http://dx.doi.org/10.1016/j.knosys.2011.07.005},
  url = {http://www.sciencedirect.com/science/article/pii/S0950705111001456},
  author = {Philippe Fournier-Viger and Usef Faghihi and Roger Nkambou and Engelbert Mephu Nguifo},
  keywords = {Sequential rule mining},
  keywords = {Sequential pattern mining},
  keywords = {Association rule mining},
  keywords = {Sequence database},
  keywords = {Educational data mining },
  abstract = {Sequential rule mining is an important data mining task used in a wide range of applications. However, current algorithms for discovering sequential rules common to several sequences use very restrictive definitions of sequential rules, which make them unable to recognize that similar rules can describe the same phenomenon. This can have many undesirable effects, such as (1) similar rules that are rated differently, (2) rules that are not found because they are considered uninteresting when taken individually, and (3) rules that are too specific, which makes them less likely to be used for making predictions. In this paper, we address these problems by proposing a more general form of sequential rules such that items in the antecedent and in the consequent of each rule are unordered. We propose an algorithm named \{CMRules\} for mining this form of rules. The algorithm proceeds by first finding association rules to prune the search space for items that occur jointly in many sequences. Then it eliminates association rules that do not meet the minimum confidence and support thresholds according to the sequential ordering. We evaluate the performance of \{CMRules\} in three different ways. First, we provide an analysis of its time complexity. Second, we compare its performance (in terms of execution time, memory usage and scalability) with an adaptation of an algorithm from the literature that we name CMDeo. For this comparison, we use three real-life public datasets, which have different characteristics and represent three kinds of data. In many cases, results show that \{CMRules\} is faster and has a better scalability for low support thresholds than CMDeo. Lastly, we report a successful application of the algorithm in a tutoring agent. }
}
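Note: the rule form proposed in FournierViger201263 leaves items unordered within the antecedent and the consequent but requires the antecedent to occur before the consequent. The Python sketch below illustrates that definition and the resulting sequential support/confidence under our reading of the abstract; it implements only the final counting step, not the CMRules association-rule pruning strategy.

def holds(sequence, X, Y):
    # sequence: list of itemsets (sets) in temporal order. X => Y holds if
    # all items of X have appeared by some position and all items of Y
    # appear strictly afterwards (an assumed formalization).
    seen = set()
    for p, itemset in enumerate(sequence):
        seen |= itemset
        if X <= seen:
            later = set().union(*sequence[p + 1:]) if p + 1 < len(sequence) else set()
            return Y <= later
    return False

def seq_support_confidence(db, X, Y):
    # Support: fraction of sequences where X => Y holds.
    # Confidence: among sequences containing X, fraction where it holds.
    contains_x = sum(X <= set().union(*s) for s in db)
    matches = sum(holds(s, X, Y) for s in db)
    return matches / len(db), (matches / contains_x if contains_x else 0.0)

db = [[{"a"}, {"b"}, {"c"}],
      [{"a", "b"}, {"c"}],
      [{"c"}, {"a"}]]
print(seq_support_confidence(db, {"a"}, {"c"}))  # (0.666..., 0.666...)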
@article{Tonkovich2012215,
  title = {Experimental observations of tyre deformation characteristics on heavy mining vehicles under static and quasi-static loading },
  journal = {Journal of Terramechanics },
  volume = {49},
  number = {3–4},
  pages = {215 - 231},
  year = {2012},
  note = {},
  issn = {0022-4898},
  doi = {http://dx.doi.org/10.1016/j.jterra.2012.05.005},
  url = {http://www.sciencedirect.com/science/article/pii/S0022489812000298},
  author = {Aleksander Tonkovich and Zhanbiao Li and Sante DiCecco and William Altenhof and Richard Banting and Henry Hu},
  keywords = {Heavy mining vehicle},
  keywords = {Experimental tyre deformation},
  keywords = {Off-the-road tyre},
  keywords = {Static tyre loading},
  keywords = {Quasi-static tyre excitation },
  abstract = {Due to large sidewall and bead thicknesses, multi-piece rims are necessary for use with large off-the-road (OTR) tyres. This paper presents the testing protocol and observed load/deflection and vertical/sidewall deflection characteristics of three Goodyear \{OTR\} tyre assemblies, namely, (1) a radial 29.5R29 (2) a bias-ply 29.5-29, and (3) a bias-ply 26.5-26. Localized tyre deformations and rim displacements were measured using optical displacement transducers and post-processing high-speed camera images using digital image analysis software. A validation analysis illustrated a maximum difference of 4.05% of vertical wheel displacements between the aforementioned methods. Quasi-static tests show the maximum values of vertical rim displacement and lateral tyre deflection are in the range of 72.2–78.9 mm and 23.3–27.1 mm, respectively, for a severe excitation condition. Differences ranging from 0.2% to 21.5% for maximum vertical and lateral tyre deflections were found between static load tests and engineering data provided by the tyre manufacturer. Linear relationships were observed for both vertical wheel displacement and lateral tyre deflection versus load for all tests. This study demonstrates a thorough methodology to study deflection characteristics of heavy duty \{OTR\} tyres and the collected data could be very useful in the development of numerical models of wheel and tyre assemblies for mining vehicles. }
}
@article{Khalifelu201265,
  title = {Comparison and evaluation of data mining techniques with algorithmic models in software cost estimation },
  journal = {Procedia Technology },
  volume = {1},
  number = {0},
  pages = {65 - 71},
  year = {2012},
  note = {First World Conference on Innovation and Computer Sciences (INSODE 2011) },
  issn = {2212-0173},
  doi = {http://dx.doi.org/10.1016/j.protcy.2012.02.013},
  url = {http://www.sciencedirect.com/science/article/pii/S221201731200014X},
  author = {Zeynab Abbasi Khalifelu and Farhad Soleimanian Gharehchopogh},
  keywords = {Software Cost Estimation},
  keywords = {Data Mining},
  keywords = {\{COCOMO\}},
  keywords = {Linear Regression},
  keywords = {Artificial Neural Network},
  keywords = {Support Vector Regression},
  keywords = {K-Nearest Neighbors },
  abstract = {Software Cost Estimation (SCE) has been one of the important topics in software production in recent decades. Realistic estimation requires the cost and effort factors of producing software, obtained by using algorithmic or Artificial Intelligence (AI) techniques. Boehm developed the Constructive Cost Model (COCOMO), one of the algorithmic \{SCE\} models, which comes in three increasingly detailed forms: basic, intermediate and detailed. Basic \{COCOMO\} is suitable for quick, early, rough estimates of the effort required to produce software, but its accuracy is limited because it lacks factors that account for differences between cost drivers. Intermediate \{COCOMO\} takes these project attributes into account, and detailed \{COCOMO\} additionally accounts for the individual project phases. The \{COCOMO\} family of algorithmic techniques has been used since 1981. In recent years, intelligent techniques have emerged to estimate the effort required in producing software. In this paper, different data mining techniques for estimating software costs are presented, and the results of each technique are evaluated and compared. NASA project data are used to train and test each of these techniques. We compare the estimation accuracy of the \{COCOMO\} model with that of the data mining techniques. The results indicate that the data mining techniques improve the estimation accuracy of the models in many cases, yielding more accurate effort estimates. }
}
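Note: for context on the algorithmic baseline discussed in Khalifelu201265, basic COCOMO estimates effort as a power law of size, effort = a * (KLOC)^b, with Boehm's published 1981 constants depending on project class. A minimal sketch:

COEFFICIENTS = {  # Boehm's basic-model constants (1981)
    "organic": (2.4, 1.05),
    "semi-detached": (3.0, 1.12),
    "embedded": (3.6, 1.20),
}

def basic_cocomo_effort(kloc: float, mode: str = "organic") -> float:
    # Effort in person-months as a power law of program size in KLOC.
    a, b = COEFFICIENTS[mode]
    return a * kloc ** b

print(round(basic_cocomo_effort(32.0), 1))  # roughly 91.3 person-months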
@incollection{McCue2015349,
  title = {Chapter 15 - Advanced Topics },
  editor = {McCue, Colleen },
  booktitle = {Data Mining and Predictive Analysis (Second Edition) },
  publisher = {Butterworth-Heinemann},
  edition = {Second Edition},
  address = {Boston},
  year = {2015},
  pages = {349 - 365},
  isbn = {978-0-12-800229-2},
  doi = {http://dx.doi.org/10.1016/B978-0-12-800229-2.00015-8},
  url = {http://www.sciencedirect.com/science/article/pii/B9780128002292000158},
  author = {Colleen McCue},
  keywords = {Expert options},
  keywords = {boosting},
  keywords = {data partitioning},
  keywords = {unstructured data},
  keywords = {automated feature extraction},
  keywords = {text mining tools},
  keywords = {social media},
  keywords = {social network analysis (SNA)},
  keywords = {fraud detection},
  keywords = {cyber },
  abstract = {Abstract Chapter 15 includes an overview of expert options, advanced topics, and areas of current development. Topics include the analysis of unstructured data, geospatial capabilities, social media exploitation, social network analysis (SNA), fraud, and cyber. In addition, applications to other domains and functional adjacencies are discussed. }
}
@article{Gopal2011727,
  title = {Information mining — Reflections on recent advancements and the road ahead in data, text, and media mining },
  journal = {Decision Support Systems },
  volume = {51},
  number = {4},
  pages = {727 - 731},
  year = {2011},
  note = {Recent Advances in Data, Text, and Media Mining & Information Issues in Supply Chain and in Service System Design },
  issn = {0167-9236},
  doi = {http://dx.doi.org/10.1016/j.dss.2011.01.008},
  url = {http://www.sciencedirect.com/science/article/pii/S0167923611000376},
  author = {Ram Gopal and James R. Marsden and Jan Vanthienen},
  keywords = {Data mining},
  keywords = {Text mining},
  keywords = {Media mining},
  keywords = {Information mining},
  keywords = {Opinion and sentiment analysis },
  abstract = {In this introduction, we briefly summarize the state of data and text mining today. Taking a very broad view, we use the term information mining to refer to the organization and analysis of structured or unstructured data that can be quantitative, textual, and/or pictorial in nature. The key question, in our view, is, “How can we transform data (in the very broad sense of this term) into ‘actionable knowledge’, knowledge that we can use in pursuit of a specified objective(s).” After detailing a set of key components of information mining, we introduce each of the papers in this volume and detail the focus of their contributions. }
}
@incollection{Hallinan201227,
  title = {Chapter 2 - Data mining for microbiologists },
  editor = {Colin Harwood and Anil Wipat},
  booktitle = {Systems Biology of Bacteria},
  publisher = {Academic Press},
  year = {2012},
  volume = {39},
  pages = {27 - 79},
  series = {Methods in Microbiology },
  issn = {0580-9517},
  doi = {http://dx.doi.org/10.1016/B978-0-08-099387-4.00002-8},
  url = {http://www.sciencedirect.com/science/article/pii/B9780080993874000028},
  author = {J.S. Hallinan},
  keywords = {Data mining},
  keywords = {Machine learning},
  keywords = {Knowledge discovery},
  keywords = {Classification},
  keywords = {Clustering },
  abstract = {Abstract The enormous amounts of molecular microbiological data currently produced by high-throughput analytical techniques pose both huge opportunities and huge challenges for microbiologists. With over 1000 databases online, it is clearly not feasible for researchers to manually search each one for information about the genes and processes in which they are interested. Much of the data stored in these databases never makes it into the peer-reviewed literature, and so becomes essentially unavailable in its entirety. A powerful approach to maximising the usefulness of large datasets, whether generated in-house or obtained from public repositories, is data integration and mining. Data integration is the process of bringing together large amounts of disparate data into a single, computationally accessible data source, while data mining is the process of finding hidden patterns and relationships in such large datasets. A wide range of algorithms is used for data mining, including established statistical methods, and approaches from the field of machine learning. The various algorithms available have different strengths and weaknesses, and are applicable to different types of data. In this review we first discuss the data mining life cycle and then describe some of the most widely used algorithms, illustrating their applications with examples from the microbiological literature. Where possible, we have identified freely available software for implementing these algorithms. }
}
@article{Kemp20121,
  title = {Corporate social responsibility, mining and “audit culture” },
  journal = {Journal of Cleaner Production },
  volume = {24},
  number = {0},
  pages = {1 - 10},
  year = {2012},
  note = {},
  issn = {0959-6526},
  doi = {http://dx.doi.org/10.1016/j.jclepro.2011.11.002},
  url = {http://www.sciencedirect.com/science/article/pii/S0959652611004343},
  author = {Deanna Kemp and John R. Owen and Shashi van de Graaff},
  keywords = {Audit},
  keywords = {Accountability},
  keywords = {Corporate social responsibility},
  keywords = {Social performance},
  keywords = {Mining},
  keywords = {Risk },
  abstract = {This article engages internal organizational aspects of ‘accountability’ for corporate social responsibility (CSR) in mining by challenging the current ‘audit culture’. Audits offer a tool through which to shape and regulate corporate social performance (CSP). Where audits have limited value is in their ability to stimulate internal engagement around social and organizational norms and principles, as the process relies on auditors to generate performance data against pre-selected indicators. Data is then utilized to produce a measure of risk or effectiveness through which to demonstrate compliance. Focusing on the internal organizational aspects of accountability and the processes, mechanisms and methodologies used to establish critical reflection, three alternatives within the current audit regime are presented. These forms of ‘new accounting’ stand in contrast to conventional auditing, as their focus is on building cross-functional connections and collaborative internal relationships that are based on dialogue and mutual exchange about the problems and possibilities of \{CSR\} implementation. }
}
@article{Tsai201114094,
  title = {Experiments in term weighting for novelty mining },
  journal = {Expert Systems with Applications },
  volume = {38},
  number = {11},
  pages = {14094 - 14101},
  year = {2011},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2011.04.218},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417411007457},
  author = {Flora S. Tsai and Agus T. Kwee},
  keywords = {Novelty mining},
  keywords = {Novelty detection},
  keywords = {Term weighting},
  keywords = {Binary},
  keywords = {Term frequency},
  keywords = {Inverse document frequency},
  keywords = {Threshold},
  keywords = {Novelty dataset },
  abstract = {Obtaining new information in a short time is becoming crucial in today’s economy. A lot of information, both offline and online, is easily acquired, exacerbating the problem of information overload. Novelty mining detects documents/sentences that contain novel or new information and presents those results directly to users (Tang, Tsai, & Chen, 2010). Many methods and algorithms for novelty mining have previously been studied, but none have compared and discussed the impact of term weighting on the evaluation measures. This paper performs experiments to recommend the best term weighting function for both document- and sentence-level novelty mining. }
}
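Note: Tsai201114094 compares term weighting functions for novelty mining. As a rough sketch of the underlying pipeline (under assumptions common in this literature, not taken from the paper itself): an incoming sentence is scored as novel when 1 minus its maximum cosine similarity to the sentences seen so far exceeds a threshold, and the term weighting scheme decides the vector entries. Binary and raw term frequency weighting are shown; the paper also studies inverse-document-frequency variants. The tokenization and threshold are illustrative.

import math
from collections import Counter

def weights(tokens, scheme="tf"):
    # Map tokens to term weights under the chosen scheme.
    tf = Counter(tokens)
    if scheme == "binary":
        return {t: 1.0 for t in tf}
    return dict(tf)

def cosine(u, v):
    dot = sum(u[t] * v.get(t, 0.0) for t in u)
    nu = math.sqrt(sum(x * x for x in u.values()))
    nv = math.sqrt(sum(x * x for x in v.values()))
    return dot / (nu * nv) if nu and nv else 0.0

def is_novel(sentence, history, scheme="tf", threshold=0.6):
    # Novelty score = 1 - max similarity to any previously seen sentence.
    w = weights(sentence.lower().split(), scheme)
    sims = [cosine(w, weights(h.lower().split(), scheme)) for h in history]
    return (1 - max(sims, default=0.0)) >= threshold

history = ["the market fell sharply today"]
print(is_novel("the market fell sharply today again", history))  # False: near-duplicate
print(is_novel("new vaccine trial results announced", history))  # True: novel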
@article{Verbeke2012211,
  title = {New insights into churn prediction in the telecommunication sector: A profit driven data mining approach },
  journal = {European Journal of Operational Research },
  volume = {218},
  number = {1},
  pages = {211 - 229},
  year = {2012},
  note = {},
  issn = {0377-2217},
  doi = {http://dx.doi.org/10.1016/j.ejor.2011.09.031},
  url = {http://www.sciencedirect.com/science/article/pii/S0377221711008599},
  author = {Wouter Verbeke and Karel Dejaeger and David Martens and Joon Hur and Bart Baesens},
  keywords = {Data mining},
  keywords = {Churn prediction},
  keywords = {Profit},
  keywords = {Input selection},
  keywords = {Oversampling},
  keywords = {Telecommunication sector },
  abstract = {Customer churn prediction models aim to indicate the customers with the highest propensity to attrite, making it possible to improve the efficiency of customer retention campaigns and to reduce the costs associated with churn. Although cost reduction is their prime objective, churn prediction models are typically evaluated using statistically based performance measures, resulting in suboptimal model selection. Therefore, in the first part of this paper, a novel, profit centric performance measure is developed, by calculating the maximum profit that can be generated by including the optimal fraction of customers with the highest predicted probabilities to attrite in a retention campaign. The novel measure selects the optimal model and fraction of customers to include, yielding a significant increase in profits compared to statistical measures. In the second part an extensive benchmarking experiment is conducted, evaluating various classification techniques applied on eleven real-life data sets from telecom operators worldwide by using both the profit centric and statistically based performance measures. The experimental results show that a small number of variables suffices to predict churn with high accuracy, and that oversampling generally does not improve the performance significantly. Finally, a large group of classifiers is found to yield comparable performance. }
}
@article{Lin201115143,
  title = {Temporal data mining with up-to-date pattern trees },
  journal = {Expert Systems with Applications },
  volume = {38},
  number = {12},
  pages = {15143 - 15150},
  year = {2011},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2011.05.090},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417411008748},
  author = {Chun-Wei Lin and Tzung-Pei Hong},
  keywords = {Data mining},
  keywords = {Temporal data mining},
  keywords = {Up-to-date pattern},
  keywords = {UDP-tree},
  keywords = {UDP-growth },
  abstract = {Mining interesting and useful frequent patterns from large databases has attracted much attention in recent years. Among the mining approaches, finding temporal patterns and regularities is very important due to its practicality. In the past, Hong et al. proposed up-to-date patterns, which are frequent within their up-to-date lifetimes. Formally, an up-to-date pattern is a pair consisting of an itemset and its corresponding valid lifetime, in which the user-defined minimum support threshold must be satisfied. They also proposed an Apriori-like approach to find the up-to-date patterns. This paper thus proposes the up-to-date pattern tree (UDP tree) to keep the up-to-date 1-patterns in a tree structure, reducing database scans. It is similar to the FP-tree structure but more complex due to the requirements of up-to-date patterns. The UDP-growth mining approach is also designed to find the up-to-date patterns from the \{UDP\} tree. The experimental results show that the proposed approach performs better than the level-wise mining algorithm. }
}
@article{Liu2012685,
  title = {Workflow simulation for operational decision support using event graph through process mining },
  journal = {Decision Support Systems },
  volume = {52},
  number = {3},
  pages = {685 - 697},
  year = {2012},
  note = {},
  issn = {0167-9236},
  doi = {http://dx.doi.org/10.1016/j.dss.2011.11.003},
  url = {http://www.sciencedirect.com/science/article/pii/S0167923611002077},
  author = {Ying Liu and Hui Zhang and Chunping Li and Roger Jianxin Jiao},
  keywords = {Business process management},
  keywords = {Business process simulation},
  keywords = {Event graph},
  keywords = {Process mining },
  abstract = {It is increasingly common to see computer-based simulation being used as a vehicle to model and analyze business processes in relation to process management and improvement. While there are a number of business process management (BPM) and business process simulation (BPS) methodologies, approaches and tools available, it is more desirable to have a systemic \{BPS\} approach for operational decision support, from constructing process models based on historical data to simulating processes for typical and common problems. In this paper, we propose a generic \{BPS\} approach for operational decision support, which includes business process modeling and workflow simulation with the generated models. Processes are modeled with event graphs through process mining from workflow logs that integrate comprehensive information about the control-flow, data and resource aspects of a business process. A case study of a credit card application is presented to illustrate the steps involved in constructing an event graph. Evaluation details are also given in terms of precision, generalization and robustness. Based on the event graph model constructed, we simulate the process under different scenarios and analyze the simulation logs for three generic problems in the case study: 1) a suitable resource allocation plan for different case arrival rates; 2) teamwork performance under different case arrival rates; and 3) evaluation and prediction of personal performance. Our experimental results show that the proposed approach is able to model business processes using event graphs and simulate the processes for common operational decision support, which collectively play an important role in process management and improvement. }
}
@article{Casaburi201519,
  title = {“Magic mirror in my hand, what is the sentiment in the lens?”: An action unit based approach for mining sentiments from multimedia contents },
  journal = {Journal of Visual Languages & Computing },
  volume = {27},
  number = {0},
  pages = {19 - 28},
  year = {2015},
  note = {Distributed Multimedia Systems \{DMS2014\} Part \{II\} },
  issn = {1045-926X},
  doi = {http://dx.doi.org/10.1016/j.jvlc.2015.01.001},
  url = {http://www.sciencedirect.com/science/article/pii/S1045926X15000026},
  author = {Luca Casaburi and Francesco Colace and Massimo De Santo and Luca Greco},
  keywords = {Affective computing},
  keywords = {Ekman theory},
  keywords = {Emotional intelligence},
  keywords = {Human computer interaction },
  abstract = {In psychology and philosophy, emotion is a subjective, conscious experience characterized primarily by psychophysiological expressions, biological reactions, and mental states. Emotion can also be considered a “positive or negative experience” that is associated with a particular pattern of physiological activity. Thus, the extraction and recognition of emotions from multimedia contents is becoming one of the most challenging research topics in human–computer interaction. Facial expressions, posture, gestures, speech, and emotive changes of physical parameters (e.g. body temperature, blush and changes in the tone of the voice) can reflect changes in the user’s emotional state, and all these kinds of parameters can be detected and interpreted by a computer, leading to the so-called “affective computing”. In this paper an approach for the extraction of emotions from images and videos is introduced. In particular, it involves the extraction of action units from facial expressions according to Ekman’s theory. The proposed approach has been tested on standard and real datasets with interesting and promising results. }
}
@article{Boulila2011386,
  title = {A data mining based approach to predict spatiotemporal changes in satellite images },
  journal = {International Journal of Applied Earth Observation and Geoinformation },
  volume = {13},
  number = {3},
  pages = {386 - 395},
  year = {2011},
  note = {},
  issn = {0303-2434},
  doi = {http://dx.doi.org/10.1016/j.jag.2011.01.008},
  url = {http://www.sciencedirect.com/science/article/pii/S0303243411000250},
  author = {W. Boulila and I.R. Farah and K. Saheb Ettabaa and B. Solaiman and H. Ben Ghézala},
  keywords = {Remote sensing},
  keywords = {Knowledge discovery in satellite image databases},
  keywords = {Data mining},
  keywords = {Prediction},
  keywords = {Land cover change},
  keywords = {Classification},
  keywords = {Decision trees},
  keywords = {Fuzzy logic },
  abstract = {The interpretation of remotely sensed images in a spatiotemporal context is becoming a valuable research topic. However, the constant growth of data volume in remote sensing imaging makes reaching conclusions based on collected data a challenging task. Recently, data mining has emerged as a promising research field, leading to several interesting discoveries in various areas such as marketing, surveillance, fraud detection and scientific discovery. By integrating data mining and image interpretation techniques, accurate and relevant information (i.e. functional relations between observed parcels and a set of informational contents) can be automatically elicited. This study presents a new approach to predict spatiotemporal changes in satellite image databases. The proposed method exploits fuzzy sets and data mining concepts to build predictions and decisions for several remote sensing fields. It takes into account imperfections related to the spatiotemporal mining process in order to provide more accurate and reliable information about land cover changes in satellite images. The proposed approach is validated using \{SPOT\} images representing the Saint-Denis region, capital of Reunion Island. Results show the good performance of the proposed framework in predicting change for the urban zone. }
}
@article{Velásquez20111532,
  title = {Extracting significant Website Key Objects: A Semantic Web mining approach },
  journal = {Engineering Applications of Artificial Intelligence },
  volume = {24},
  number = {8},
  pages = {1532 - 1541},
  year = {2011},
  note = {Semantic-based Information and Engineering Systems },
  issn = {0952-1976},
  doi = {http://dx.doi.org/10.1016/j.engappai.2011.02.001},
  url = {http://www.sciencedirect.com/science/article/pii/S095219761100025X},
  author = {Juan D. Velásquez and Luis E. Dujovne and Gaston L’Huillier},
  keywords = {Website Key Objects},
  keywords = {Web user preferences},
  keywords = {Web usage mining},
  keywords = {Semantic Web mining},
  keywords = {Self Organizing Feature Maps},
  keywords = {Ontology Engineering},
  keywords = {Web Intelligence},
  keywords = {Clickstream analysis},
  keywords = {Web personalization },
  abstract = {Web mining has traditionally been used in different application domains in order to enhance the content that Web users are accessing. Likewise, Website administrators are interested in finding new approaches to improve their Website content according to their users' preferences. Furthermore, the Semantic Web has been considered as an alternative to represent Web content in a way which can be used by intelligent techniques to provide the organization, meaning, and definition of Web content. In this work, we define the Website Key Object Extraction problem, whose solution is based on a Semantic Web mining approach that extracts, from a given Website core ontology, new relations between objects according to Web user interests. This methodology was applied to a real Website, and the results showed that the automatic extraction of Key Objects is highly competitive against traditional surveys applied to Web users. }
}
@article{Guns20111951,
  title = {Itemset mining: A constraint programming perspective },
  journal = {Artificial Intelligence },
  volume = {175},
  number = {12–13},
  pages = {1951 - 1983},
  year = {2011},
  note = {},
  issn = {0004-3702},
  doi = {http://dx.doi.org/10.1016/j.artint.2011.05.002},
  url = {http://www.sciencedirect.com/science/article/pii/S0004370211000646},
  author = {Tias Guns and Siegfried Nijssen and Luc De Raedt},
  keywords = {Data mining},
  keywords = {Itemset mining},
  keywords = {Constraint programming },
  abstract = {The field of data mining has become accustomed to specifying constraints on patterns of interest. A large number of systems and techniques have been developed for solving such constraint-based mining problems, especially for mining itemsets. The approach taken in the field of data mining contrasts with the constraint programming principles developed within the artificial intelligence community. While most data mining research focuses on algorithmic issues and aims at developing highly optimized and scalable implementations that are tailored towards specific tasks, constraint programming employs a more declarative approach. The emphasis lies on developing high-level modeling languages and general solvers that specify what the problem is, rather than outlining how a solution should be computed, yet are powerful enough to be used across a wide variety of applications and application domains. This paper contributes a declarative constraint programming approach to data mining. More specifically, we show that it is possible to employ off-the-shelf constraint programming techniques for modeling and solving a wide variety of constraint-based itemset mining tasks, such as frequent, closed, discriminative, and cost-based itemset mining. In particular, we develop a basic constraint programming model for specifying frequent itemsets and show that this model can easily be extended to realize the other settings. This contrasts with typical procedural data mining systems where the underlying procedures need to be modified in order to accommodate new types of constraints, or novel combinations thereof. Even though state-of-the-art data mining systems outperform the constraint programming approach on some standard tasks, we also show that there exist problems where the constraint programming approach leads to significant performance improvements over state-of-the-art methods in data mining, as well as to new insights into the underlying data mining problems. Many such insights can be obtained by relating the underlying search algorithms of data mining and constraint programming systems to one another. We discuss a number of interesting new research questions and challenges raised by the declarative constraint programming approach to data mining. }
}
@article{Liu20121067,
  title = {Noisy data elimination using mutual k-nearest neighbor for classification mining },
  journal = {Journal of Systems and Software },
  volume = {85},
  number = {5},
  pages = {1067 - 1074},
  year = {2012},
  note = {},
  issn = {0164-1212},
  doi = {http://dx.doi.org/10.1016/j.jss.2011.12.019},
  url = {http://www.sciencedirect.com/science/article/pii/S0164121211003049},
  author = {Huawen Liu and Shichao Zhang},
  keywords = {Data mining},
  keywords = {Pattern classification},
  keywords = {kNN},
  keywords = {Mutual nearest neighbor},
  keywords = {Data reduction },
  abstract = {k-nearest neighbor (kNN) is an effective and powerful lazy learning algorithm that is also easy to implement. However, its performance heavily relies on the quality of the training data. In many complex real-world applications, noise from various sources is often prevalent in large-scale databases. How to eliminate anomalies and improve the quality of data is still a challenge. To alleviate this problem, in this paper we propose a new anomaly removal and learning algorithm under the framework of kNN. The primary characteristic of our method is that the evidence for removing anomalies and predicting the class labels of unseen instances is mutual nearest neighbors, rather than k nearest neighbors. The advantage is that pseudo nearest neighbors can be identified and will not be taken into account during the prediction process. Consequently, the final learning result is more credible. An extensive comparative experimental analysis carried out on \{UCI\} datasets provided empirical evidence of the effectiveness of the proposed method for enhancing the performance of the k-NN rule. }
}
@article{Olson2012464,
  title = {Comparative analysis of data mining methods for bankruptcy prediction },
  journal = {Decision Support Systems },
  volume = {52},
  number = {2},
  pages = {464 - 473},
  year = {2012},
  note = {},
  issn = {0167-9236},
  doi = {http://dx.doi.org/10.1016/j.dss.2011.10.007},
  url = {http://www.sciencedirect.com/science/article/pii/S0167923611001709},
  author = {David L. Olson and Dursun Delen and Yanyan Meng},
  keywords = {Bankruptcy prediction},
  keywords = {Data mining},
  keywords = {Neural networks},
  keywords = {Decision trees},
  keywords = {Support vector machines},
  keywords = {Transparency},
  keywords = {Transportability },
  abstract = {A great deal of research has been devoted to the prediction of bankruptcy, including applications of data mining. Neural networks, support vector machines, and other algorithms often fit data well, but because of their lack of comprehensibility, they are considered black box technologies. Conversely, decision trees are more comprehensible to human users. However, sometimes far too many rules result in another form of incomprehensibility. The number of rules obtained from decision tree algorithms can be controlled to some degree by setting different minimum support levels. This study applies a variety of data mining tools to bankruptcy data, with the purpose of comparing accuracy and number of rules. For these data, decision trees were found to be more accurate than neural networks and support vector machines, but there were more rule nodes than desired. Adjustment of minimum support yielded more tractable rule sets. }
}
@article{Yeh20104779,
  title = {\{HHUIF\} and MSICF: Novel algorithms for privacy preserving utility mining },
  journal = {Expert Systems with Applications },
  volume = {37},
  number = {7},
  pages = {4779 - 4786},
  year = {2010},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2009.12.038},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417409010847},
  author = {Jieh-Shan Yeh and Po-Chiang Hsu},
  keywords = {Privacy preserving},
  keywords = {Utility mining},
  keywords = {Data mining },
  abstract = {Privacy preserving data mining (PPDM) is a popular topic in the research community. How to strike a balance between privacy protection and knowledge discovery in the sharing process is an important issue. This study focuses on privacy preserving utility mining (PPUM) and presents two novel algorithms, \{HHUIF\} and MSICF, to achieve the goal of hiding sensitive itemsets so that the adversaries cannot mine them from the modified database. The work also minimizes the impact on the sanitized database of hiding sensitive itemsets. The experimental results show that \{HHUIF\} achieves lower miss costs than \{MSICF\} on two synthetic datasets. On the other hand, \{MSICF\} generally has a lower difference ratio than \{HHUIF\} between original and sanitized databases. }
}
@article{Tsai201111040,
  title = {Database optimization for novelty mining of business blogs },
  journal = {Expert Systems with Applications },
  volume = {38},
  number = {9},
  pages = {11040 - 11047},
  year = {2011},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2011.02.148},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417411003691},
  author = {Flora S. Tsai and Agus T. Kwee},
  keywords = {Novelty mining},
  keywords = {Novelty detection},
  keywords = {Business blogs},
  keywords = {Database optimization},
  keywords = {Weblog },
  abstract = {The widespread growth of business blogs has created opportunities for companies as channels of marketing, communication, customer feedback, and mass opinion measurement. However, many blogs often contain similar information, and the sheer volume of available information challenges the ability of organizations to act quickly in today’s business environment. Thus, novelty mining can help single out novel information from a massive set of text documents. This paper explores the feasibility and performance of novelty mining and database optimization of business blogs, which have not been studied before. The results show that our novelty mining system can detect novelty in our dataset of business blogs with very high accuracy, and that database optimization can significantly improve the performance. }
}
@article{Guo20115006,
  title = {Research on \{CBR\} system based on data mining },
  journal = {Applied Soft Computing },
  volume = {11},
  number = {8},
  pages = {5006 - 5014},
  year = {2011},
  note = {},
  issn = {1568-4946},
  doi = {http://dx.doi.org/10.1016/j.asoc.2011.05.057},
  url = {http://www.sciencedirect.com/science/article/pii/S1568494611002171},
  author = {Yuan Guo and Jie Hu and Yinghong Peng},
  keywords = {\{CBR\}},
  keywords = {Data mining},
  keywords = {\{ANN\}},
  keywords = {\{GHSOM\} },
  abstract = {Case-based reasoning (CBR) is a popular problem solving methodology which solves a new problem by remembering previous similar situations and reusing knowledge from the solutions to these situations. To address the traditional \{CBR\} system's heavy dependence on experts or engineers, this paper introduces data mining technology into the \{CBR\} system: \{GHSOM\} (Growing Hierarchical Self-Organizing Map), an effective data mining tool based on \{ANN\} (artificial neural network) technology, is integrated with it. After principal features are selected from numerous initial features to represent each case, cases are organized and managed in the case base through \{GHSOM\}; when case retrieval is conducted, a new case is guided into the corresponding sub-case base, which greatly improves the system's accuracy and efficiency. Finally, experiments are conducted to validate the effectiveness of the proposed methods by comparing them with other recent research. }
}
@article{McKee2011276,
  title = {Policy Matters Now and in the Future: Net Neutrality, Corporate Data Mining, and Government Surveillance },
  journal = {Computers and Composition },
  volume = {28},
  number = {4},
  pages = {276 - 291},
  year = {2011},
  note = {Composition 20/20: How the Future of the Web Could Sharpen the Teaching of Writing },
  issn = {8755-4615},
  doi = {http://dx.doi.org/10.1016/j.compcom.2011.09.001},
  url = {http://www.sciencedirect.com/science/article/pii/S8755461511000673},
  author = {Heidi A. McKee},
  keywords = {Data mining},
  keywords = {Future},
  keywords = {Internet},
  keywords = {Net neutrality},
  keywords = {Policy},
  keywords = {Privacy},
  keywords = {Regulations},
  keywords = {Surveillance },
  abstract = {In this article, I will detail three key policy issues that have a profound effect on the future of the World Wide Web and Internet-based communications: net neutrality, corporate data mining, and government surveillance. Focusing on policy issues in the U.S., I will describe not only current practices and cases, but future possibilities for writers and teachers of writing. I will draw from work in composition, interdisciplinary studies on privacy, information sharing, and surveillance on the Internet, analyses of applicable policies and laws, and the advocacy efforts by organizations. Issues I will examine include the importance of and threats to net neutrality; how data mining and (so-called) privacy agreements currently work, specifically at social networking sites often used in writing classrooms; and how government and institutional surveillance is far more prevalent than many realize. I will close with recommendations for what writing instructors (and students) can do to try to craft a different future, one where writers and the visual, verbal, aural writing they read and produce online will not be collected, scrutinized, and controlled (or, realistically, at least not as much). }
}
@article{Lu20151,
  title = {Detecting short-term cyclical topic dynamics in the user-generated content and news },
  journal = {Decision Support Systems },
  volume = {70},
  number = {0},
  pages = {1 - 14},
  year = {2015},
  note = {},
  issn = {0167-9236},
  doi = {http://dx.doi.org/10.1016/j.dss.2014.11.006},
  url = {http://www.sciencedirect.com/science/article/pii/S0167923614002735},
  author = {Hsin-Min Lu},
  keywords = {Topic models},
  keywords = {Gibbs sampling},
  keywords = {Temporal dynamics},
  keywords = {Context dependent},
  keywords = {Cyclical dynamics },
  abstract = {With the maturation of the Internet and mobile technology, Internet users are now able to produce and consume text data in different contexts. Linking the context to the text data can provide valuable information regarding users' activities and preferences, which are useful for decision support tasks such as market segmentation and product recommendation. To this end, previous studies have proposed to incorporate into topic models contextual information such as authors' identities and timestamps. Despite recent efforts to incorporate contextual information, few studies have focused on the short-term cyclical topic dynamics that connect the changes in topic occurrences to the time of day, the day of the week, and the day of the month. Short-term cyclical topic dynamics can both characterize the typical contexts to which a user is exposed on different occasions and identify user habits in specific contexts. Both abilities are essential for decision support tasks that are context dependent. To address this challenge, we present the Probit-Dirichlet hybrid allocation (PDHA) topic model, which incorporates a document's temporal features to capture a topic's short-term cyclical dynamics. A document's temporal features enter the topic model through the regression covariates of a multinomial-Probit-like structure that influences the prior topic distribution of individual tokens. By incorporating temporal features for monthly, weekly, and daily cyclical dynamics, \{PDHA\} is able to capture interesting short-term cyclical patterns that characterize topic dynamics. We developed an augmented Gibbs sampling algorithm for the non-Dirichlet-conjugate setting in PDHA. We then demonstrated the utility of \{PDHA\} using text collections from user-generated content, newswires, and newspapers. Our experiments show that \{PDHA\} achieves higher hold-out likelihood values compared to baseline models, including latent Dirichlet allocation (LDA) and Dirichlet-multinomial regression (DMR). The temporal features for short-term cyclical dynamics and the novel model structure of \{PDHA\} both contribute to this performance advantage. The results suggest that \{PDHA\} is an attractive approach for decision support tasks involving text mining. }
}
@article{Kovačević2012105,
  title = {Mining methodologies from \{NLP\} publications: A case study in automatic terminology recognition },
  journal = {Computer Speech & Language },
  volume = {26},
  number = {2},
  pages = {105 - 126},
  year = {2012},
  note = {},
  issn = {0885-2308},
  doi = {http://dx.doi.org/10.1016/j.csl.2011.09.001},
  url = {http://www.sciencedirect.com/science/article/pii/S0885230811000465},
  author = {Aleksandar Kovačević and Zora Konjović and Branko Milosavljević and Goran Nenadic},
  keywords = {Information extraction},
  keywords = {Methodology mining},
  keywords = {Conditional Random Fields},
  keywords = {Automatic terminology mining },
  abstract = {The task of reviewing scientific publications and keeping up with the literature in a particular domain is extremely time-consuming. Extraction and exploration of methodological information, in particular, requires systematic understanding of the literature, but in many cases is performed within a limited context of publications that can be manually reviewed by an individual or group. Automated methodology identification could provide an opportunity for systematic retrieval of relevant documents and for exploring developments within a given discipline. In this paper we present a system for the identification of methodology mentions in scientific publications in the area of natural language processing, and in particular in automatic terminology recognition. The system comprises two major layers: the first layer is an automatic identification of methodological sentences; the second layer highlights methodological phrases (segments). Each mention is categorised in four semantic categories: Task, Method, Resource/Feature and Implementation. Extraction and classification of the segments is formalised as a sequence tagging problem and four separate phrase-based Conditional Random Fields are used to accomplish the task. The system has been evaluated on a manually annotated corpus comprising 45 full text articles. The results for the segment-level annotation show an F-measure of 53% for identification of Task and Method mentions (with 70% precision), whereas the F-measures for Resource/Feature and Implementation identification were 61% (with 67% precision) and 75% (with 86% precision) respectively. At the document level, F-measures of 72% (with 81% precision) for Task mentions, 60% (with 81% precision) for Method mentions, 74% (with 78% precision) for the Resource/Feature category and 79% (with 81% precision) for the Implementation category were achieved. We provide a detailed analysis of errors and explore the impact that particular groups of features have on the extraction of methodological segments. }
}
@article{Mustapaşa20105820,
  title = {Implementation of Semantic Web Mining on E-Learning },
  journal = {Procedia - Social and Behavioral Sciences },
  volume = {2},
  number = {2},
  pages = {5820 - 5823},
  year = {2010},
  note = {Innovation and Creativity in Education },
  issn = {1877-0428},
  doi = {http://dx.doi.org/10.1016/j.sbspro.2010.03.949},
  url = {http://www.sciencedirect.com/science/article/pii/S1877042810009894},
  author = {Oğuz Mustapaşa and Dilek Karahoca and Adem Karahoca and Ahmet Yücel and Huseyin Uzunboylu},
  keywords = {Semantic web},
  keywords = {web mining},
  keywords = {e-learning},
  keywords = {distance learning},
  keywords = {personalization },
  abstract = {The Semantic Web is a product of Web 2.0 (the second generation of the web), supported by automated semantic agents that process user data to improve ease of use and the personalization of services. Web Mining is an application of data mining which focuses on discovering patterns from Web logs and data. The semantic structure can be built from the patterns or relations discovered via web mining. By combining these two applications of both disciplines, it is possible to achieve Semantic Web Mining, a recent hot topic in educational research. This paper gives an overview of current applications of Semantic Web Mining in e-learning, which has already become a core component of education. }
}
@article{Kim2015367,
  title = {Link-topic model for biomedical abbreviation disambiguation },
  journal = {Journal of Biomedical Informatics },
  volume = {53},
  number = {0},
  pages = {367 - 380},
  year = {2015},
  note = {},
  issn = {1532-0464},
  doi = {http://dx.doi.org/10.1016/j.jbi.2014.12.013},
  url = {http://www.sciencedirect.com/science/article/pii/S1532046414002780},
  author = {Seonho Kim and Juntae Yoon},
  keywords = {Topic model},
  keywords = {Latent Dirichlet allocation},
  keywords = {Biomedical abbreviation disambiguation},
  keywords = {Semantic link},
  keywords = {Global abbreviation },
  abstract = {Introduction The ambiguity of biomedical abbreviations is one of the challenges in biomedical text mining systems. In particular, the handling of term variants and abbreviations without nearby definitions is a critical issue. In this study, we adopt the concepts of topic of document and word link to disambiguate biomedical abbreviations. Methods We newly suggest the link topic model inspired by the latent Dirichlet allocation model, in which each document is perceived as a random mixture of topics, where each topic is characterized by a distribution over words. Thus, the most probable expansions with respect to abbreviations of a given abstract are determined by word-topic, document-topic, and word-link distributions estimated from a document collection through the link topic model. The model allows two distinct modes of word generation to incorporate semantic dependencies among words, particularly long form words of abbreviations and their sentential co-occurring words; a word can be generated either dependently on the long form of the abbreviation or independently. The semantic dependency between two words is defined as a link and a new random parameter for the link is assigned to each word as well as a topic parameter. Because the link status indicates whether the word constitutes a link with a given specific long form, it has the effect of determining whether a word forms a unigram or a skipping/consecutive bigram with respect to the long form. Furthermore, we place a constraint on the model so that a word has the same topic as a specific long form if it is generated in reference to the long form. Consequently, documents are generated from the two hidden parameters, i.e. topic and link, and the most probable expansion of a specific abbreviation is estimated from the parameters. Results Our model relaxes the bag-of-words assumption of the standard topic model in which the word order is neglected, and it captures a richer structure of text than does the standard topic model by considering unigrams and semantically associated bigrams simultaneously. The addition of semantic links improves the disambiguation accuracy without removing irrelevant contextual words and reduces the parameter space of massive skipping or consecutive bigrams. The link topic model achieves 98.42% disambiguation accuracy on 73,505 \{MEDLINE\} abstracts with respect to 21 three-letter abbreviations and their 139 distinct long forms. }
}
@article{Ngai2011559,
  title = {The application of data mining techniques in financial fraud detection: A classification framework and an academic review of literature },
  journal = {Decision Support Systems },
  volume = {50},
  number = {3},
  pages = {559 - 569},
  year = {2011},
  note = {On quantitative methods for detection of financial fraud },
  issn = {0167-9236},
  doi = {http://dx.doi.org/10.1016/j.dss.2010.08.006},
  url = {http://www.sciencedirect.com/science/article/pii/S0167923610001302},
  author = {E.W.T. Ngai and Yong Hu and Y.H. Wong and Yijun Chen and Xin Sun},
  keywords = {Financial fraud},
  keywords = {Fraud detection},
  keywords = {Literature review},
  keywords = {Data mining},
  keywords = {Business intelligence },
  abstract = {This paper presents a review of — and classification scheme for — the literature on the application of data mining techniques for the detection of financial fraud. Although financial fraud detection (FFD) is an emerging topic of great importance, a comprehensive literature review of the subject has yet to be carried out. This paper thus represents the first systematic, identifiable and comprehensive academic literature review of the data mining techniques that have been applied to FFD. 49 journal articles on the subject published between 1997 and 2008 were analyzed and classified into four categories of financial fraud (bank fraud, insurance fraud, securities and commodities fraud, and other related financial fraud) and six classes of data mining techniques (classification, regression, clustering, prediction, outlier detection, and visualization). The findings of this review clearly show that data mining techniques have been applied most extensively to the detection of insurance fraud, although corporate fraud and credit card fraud have also attracted a great deal of attention in recent years. In contrast, we find a distinct lack of research on mortgage fraud, money laundering, and securities and commodities fraud. The main data mining techniques used for \{FFD\} are logistic models, neural networks, the Bayesian belief network, and decision trees, all of which provide primary solutions to the problems inherent in the detection and classification of fraudulent data. This paper also addresses the gaps between \{FFD\} and the needs of the industry to encourage additional research on neglected topics, and concludes with several suggestions for further \{FFD\} research. }
}
@article{Ventura201227,
  title = {Mining Concepts from Texts },
  journal = {Procedia Computer Science },
  volume = {9},
  number = {0},
  pages = {27 - 36},
  year = {2012},
  note = {Proceedings of the International Conference on Computational Science, \{ICCS\} 2012 },
  issn = {1877-0509},
  doi = {http://dx.doi.org/10.1016/j.procs.2012.04.004},
  url = {http://www.sciencedirect.com/science/article/pii/S1877050912001251},
  author = {João Ventura and Joaquim Silva},
  keywords = {Text Mining},
  keywords = {Term Extraction},
  keywords = {Relevant Expression},
  keywords = {Information Extraction},
  keywords = {Statistical Extractor },
  abstract = {The extraction of multi-word relevant expressions has been an increasingly hot topic in the last few years. Relevant expressions are applicable in diverse areas such as Information Retrieval, document clustering, or classification and indexing of documents. However, relevant single words, which represent much of the knowledge in texts, have been a relatively dormant field. In this paper we present a statistical language-independent approach to extract concepts formed by relevant single and multi-word units. By achieving promising precision/recall values, it can be an alternative both to language-dependent approaches and to extractors that deal exclusively with multi-words. }
}
@article{Lin20117419,
  title = {An effective tree structure for mining high utility itemsets },
  journal = {Expert Systems with Applications },
  volume = {38},
  number = {6},
  pages = {7419 - 7424},
  year = {2011},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2010.12.082},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417410014454},
  author = {Chun-Wei Lin and Tzung-Pei Hong and Wen-Hsiang Lu},
  keywords = {Utility mining},
  keywords = {High utility pattern},
  keywords = {HUP-tree},
  keywords = {HUP-growth},
  keywords = {Two-phase mining},
  keywords = {Downward closure },
  abstract = {In the past, many algorithms were proposed to mine association rules, most of which were based on item frequency values. Considering that a customer may buy many copies of an item and each item may have a different profit, mining frequent patterns from a traditional database is not suitable for some real-world applications. Utility mining was thus proposed to consider costs, profits and other measures according to user preference. In this paper, the high utility pattern tree (HUP tree) is designed and the HUP-growth mining algorithm is proposed to derive high utility patterns effectively and efficiently. The proposed approach integrates the previous two-phase procedure for utility mining and the FP-tree concept to utilize the downward-closure property and generate a compressed tree structure. Experimental results also show that the proposed approach performs better than Liu et al.’s two-phase algorithm in terms of execution time. Finally, the numbers of tree nodes generated by three different item ordering methods are compared, with results showing that frequency ordering produces fewer tree nodes than the other two. }
}
@article{Mohaghegh2011697,
  title = {Reservoir simulation and modeling based on artificial intelligence and data mining (AI&DM) },
  journal = {Journal of Natural Gas Science and Engineering },
  volume = {3},
  number = {6},
  pages = {697 - 705},
  year = {2011},
  note = {Artificial Intelligence and Data Mining },
  issn = {1875-5100},
  doi = {http://dx.doi.org/10.1016/j.jngse.2011.08.003},
  url = {http://www.sciencedirect.com/science/article/pii/S1875510011001090},
  author = {Shahab Dean Mohaghegh},
  keywords = {Reservoir Simulation and Modeling},
  keywords = {Artificial Intelligence and Data Mining},
  keywords = {Pattern Recognition in reservoir modeling},
  keywords = {Top-Down Modeling},
  keywords = {Surrogate Reservoir Modeling },
  abstract = {In this paper a new class of reservoir models that are developed based on the pattern recognition technologies collectively known as Artificial Intelligence and Data Mining (AI&DM) is introduced. The workflows developed based on this new class of reservoir simulation and modeling tools break new ground in modeling fluid flow through porous media by providing a completely new and different angle on reservoir simulation and modeling. The philosophy behind this modeling approach and its major commonalities and differences with numerical and analytical models are explored, and two different categories of such models are explained. Details of this technology are presented using examples of the most recent applications to several prolific reservoirs in the Middle East and in the Gulf of Mexico. AI-based Reservoir Models can be developed for green or brown fields. Since these models are developed based on spatio-temporal databases that are specifically developed for this purpose, they require the existence of a basic numerical reservoir simulator for green fields, while they can be developed entirely from historical data for brown fields. The run-time of AI-based Reservoir Models that provide complete field responses is measured in seconds rather than hours and days (even for a multi-million grid block reservoir). Therefore, providing means for fast-track reservoir analysis and AI-assisted history matching are intrinsic characteristics of these models. AI-based Reservoir Models can, in some cases, completely substitute for numerical reservoir simulation models, work side by side with them yet completely independently, or be integrated with them in order to increase their productivity. Advantages associated with AI-based Reservoir Models are short development time, low development cost, fast-track analysis and the practical capability to quantify the uncertainties associated with the static model. AI-based Reservoir Models include a novel design tool for comprehensive analysis of the full field and design of field development strategies to meet operational targets. They have an open data-requirement architecture that can accommodate a wide variety of data, from pressure tests to seismic. }
}
@article{AlZaidy2012147,
  title = {Mining criminal networks from unstructured text documents },
  journal = {Digital Investigation },
  volume = {8},
  number = {3–4},
  pages = {147 - 160},
  year = {2012},
  note = {},
  issn = {1742-2876},
  doi = {http://dx.doi.org/10.1016/j.diin.2011.12.001},
  url = {http://www.sciencedirect.com/science/article/pii/S1742287612000023},
  author = {Rabeah Al-Zaidy and Benjamin C.M. Fung and Amr M. Youssef and Francis Fortin},
  keywords = {Forensic analysis},
  keywords = {Data mining},
  keywords = {Hypothesis generation},
  keywords = {Criminal network},
  keywords = {Information retrieval },
  abstract = {Digital data collected for forensic analysis often contain valuable information about the suspects’ social networks. However, most collected records are in the form of unstructured textual data, such as e-mails, chat messages, and text documents. An investigator often has to manually extract the useful information from the text and then enter the important pieces into a structured database for further investigation by using various criminal network analysis tools. Obviously, this information extraction process is tedious and error-prone. Moreover, the quality of the analysis varies by the experience and expertise of the investigator. In this paper, we propose a systematic method to discover criminal networks from a collection of text documents obtained from a suspect’s machine, extract useful information for investigation, and then visualize the suspect’s criminal network. Furthermore, we present a hypothesis generation approach to identify potential indirect relationships among the members in the identified networks. We evaluated the effectiveness and performance of the method on a real-life cybercrime case and some other datasets. The proposed method, together with the implemented software tool, has received positive feedback from the digital forensics team of a law enforcement unit in Canada. }
}
@article{D’hondt20113783,
  title = {Topic identification based on document coherence and spectral analysis },
  journal = {Information Sciences },
  volume = {181},
  number = {18},
  pages = {3783 - 3797},
  year = {2011},
  note = {},
  issn = {0020-0255},
  doi = {http://dx.doi.org/10.1016/j.ins.2011.04.044},
  url = {http://www.sciencedirect.com/science/article/pii/S0020025511002258},
  author = {Joris D’hondt and Paul-Armand Verhaegen and Joris Vertommen and Dirk Cattrysse and Joost R. Duflou},
  keywords = {Topic identification},
  keywords = {Spectral theory},
  keywords = {Text mining },
  abstract = {In a world with vast information overload, well-optimized retrieval of relevant information has become increasingly important. Dividing large, multiple topic spanning documents into sets of coherent subdocuments facilitates the information retrieval process. This paper presents a novel technique to automatically subdivide a textual document into consistent components based on a coherence quantification function. This function is based on stem or term chains linking document entities, such as sentences or paragraphs, based on the reoccurrences of stems or terms. Applying this function on a document results in a coherence graph of the document linking its entities. Spectral graph partitioning techniques are used to divide this coherence graph into a number of subdocuments. A novel technique is introduced to obtain the most suitable number of subdocuments. These subdocuments are an aggregation of (not necessarily adjacent) entities. Performance tests are conducted in test environments based on standardized datasets to prove the algorithm’s capabilities. The relevance of these techniques for information retrieval and text mining is discussed. }
}
@article{Huang20119483,
  title = {Mining association rules to support resource allocation in business process management },
  journal = {Expert Systems with Applications },
  volume = {38},
  number = {8},
  pages = {9483 - 9490},
  year = {2011},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2011.01.146},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417411001795},
  author = {Zhengxing Huang and Xudong Lu and Huilong Duan},
  keywords = {Association rules},
  keywords = {Data mining},
  keywords = {Resource allocation},
  keywords = {Business process management },
  abstract = {Resource allocation is of great importance for business process management. In business process execution, a set of rules that specify resource allocation is always implied. Although many approaches have been offered to support resource allocation, they are not sufficient to derive interesting resource allocation rules which ensure that each activity is performed by a suitable resource. Hence, this paper introduces an association rule mining based approach to mine interesting resource allocation rules from event logs. The idea is to consider the ordered correlations between items in the event log, and then to present two efficient algorithms to mine truly “interesting” rules. The event log of a radiology CT-scan examination process provided by the Chinese Huzhou hospital is used to verify the proposed approach. The evaluation results showed that the proposed approach not only extracts rules more efficiently and much faster, but also discovers more important resource allocation rules. }
}
@article{Köksal201113448,
  title = {A review of data mining applications for quality improvement in manufacturing industry },
  journal = {Expert Systems with Applications },
  volume = {38},
  number = {10},
  pages = {13448 - 13467},
  year = {2011},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2011.04.063},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417411005793},
  author = {Gülser Köksal and İnci Batmaz and Murat Caner Testik},
  keywords = {Knowledge discovery in databases},
  keywords = {Data mining},
  keywords = {Quality improvement},
  keywords = {Six sigma},
  keywords = {Design for six sigma},
  keywords = {Quality description},
  keywords = {Prediction},
  keywords = {Classification},
  keywords = {Parameter optimisation},
  keywords = {Data mining software},
  keywords = {Manufacturing },
  abstract = {Many quality improvement (QI) programs including six sigma, design for six sigma, and kaizen require collection and analysis of data to solve quality problems. Due to advances in data collection systems and analysis tools, data mining (DM) has widely been applied for \{QI\} in manufacturing. Although a few review papers have recently been published to discuss \{DM\} applications in manufacturing, these only cover a small portion of the applications for specific \{QI\} problems (quality tasks). In this study, an extensive review covering the literature from 1997 to 2007 and several analyses on selected quality tasks are provided on \{DM\} applications in the manufacturing industry. The quality tasks considered are: product/process quality description, predicting quality, classification of quality, and parameter optimisation. The review provides a comprehensive analysis of the literature from various points of view: data handling practices, \{DM\} applications for each quality task and for each manufacturing industry, patterns in the use of \{DM\} methods, application results, and software used in the applications are analysed. Several summary tables and figures are also provided along with the discussion of the analyses and results. Finally, conclusions and future research directions are presented. }
}
@article{Movafaghi2011191,
  title = {Sentiment Web Mining Architecture },
  journal = {Procedia - Social and Behavioral Sciences },
  volume = {26},
  number = {0},
  pages = {191 - 197},
  year = {2011},
  note = {The 2nd Collaborative Innovation Networks Conference - \{COINs2010\} },
  issn = {1877-0428},
  doi = {http://dx.doi.org/10.1016/j.sbspro.2011.10.575},
  url = {http://www.sciencedirect.com/science/article/pii/S1877042811024025},
  author = {Shahriar Movafaghi and Jack Bullock},
  keywords = {Web Mining},
  keywords = {Software Architecture},
  keywords = {Sentiment },
  abstract = {In this paper we discuss the architecture of a system under development whose purpose is to capture the sentiment of web users regarding any topic, such as retail products, financial instruments (FI), or social issues like immigration. The first step is knowledge acquisition. A Sentiment Web Mining (SWM) system requires acquisition of knowledge from several sources on the web. Such knowledge may be found on blogs, social networks, email, or online news. A \{SWM\} system has customization and personalization capabilities. For our purposes, customization occurs when the \{SWM\} user can change his/her preferences to select specific sites to be used for data mining and evaluation. Personalization occurs when the system decides which sites to use for data mining based on the user profile. The user profile dynamically changes depending on the type of user request from the system and the specific sites the user visits to verify the result of the \{SWM\} system. The second step is knowledge storage, which involves the creation of a database. Appropriate web sites will be indexed and tagged. Taxonomy is the hardest part of this step. In this paper we will demonstrate a unique way of tagging the knowledge obtained from the web. The third step is knowledge analysis/data mining. A \{SWM\} system will use a series of off-the-shelf knowledge analysis/data mining tools, including the \{SWM\} knowledge analysis/data mining engine, which is based on web services technology. The types of questions addressed can be: 1) the volume of sentiment for a particular topic; 2) the intensity of sentiment (good or bad) for a particular topic; 3) the interrelationship between the writers of material written on the web, especially if the writer is anonymous; and 4) who is/are the leader(s) of the sentiment? If information is maliciously posted on the web, the user may want to pursue it through legal means. The last step is dissemination of knowledge to the user(s). A \{SWM\} system uses third-party visualization tools as well as web-based user interfaces and reports that are written internally. The presentation component of a \{SWM\} system is decoupled from other components, namely the process component, business rule component, and data access component, for ease of maintainability. }
}
@incollection{Ambert2012109,
  title = {Chapter Six - Text-Mining and Neuroscience },
  editor = {Elissa J. Chesler and Melissa A. Haendel},
  booktitle = {Bioinformatics of Behavior: Part 1},
  publisher = {Academic Press},
  year = {2012},
  volume = {103},
  pages = {109 - 132},
  series = {International Review of Neurobiology },
  issn = {0074-7742},
  doi = {http://dx.doi.org/10.1016/B978-0-12-388408-4.00006-X},
  url = {http://www.sciencedirect.com/science/article/pii/B978012388408400006X},
  author = {Kyle H. Ambert and Aaron M. Cohen},
  keywords = {Neuroinformatics},
  keywords = {Neuroscience},
  keywords = {Text-mining},
  keywords = {Ontologies},
  keywords = {Information retrieval},
  keywords = {Document classification},
  keywords = {Machine learning },
  abstract = {The wealth and diversity of neuroscience research are inherent characteristics of the discipline that can give rise to some complications. As the field continues to expand, we generate a great deal of data about all aspects, and from multiple perspectives, of the brain, its chemistry, biology, and how these affect behavior. The vast majority of research scientists cannot afford to spend their time combing the literature to find every article related to their research, nor do they wish to spend time adjusting their neuroanatomical vocabulary to communicate with other subdomains in the neurosciences. As such, there has been a recent increase in the amount of informatics research devoted to developing digital resources for neuroscience research. Neuroinformatics is concerned with the development of computational tools to further our understanding of the brain and to make sense of the vast amount of information that neuroscientists generate (French & Pavlidis, 2007). Many of these tools are related to the use of textual data. Here, we review some of the recent developments for better using the vast amount of textual information generated in neuroscience research and publication and suggest several use cases that will demonstrate how bench neuroscientists can take advantage of the resources that are available. }
}
@article{Wang20117112,
  title = {Mining Web navigation patterns with a path traversal graph },
  journal = {Expert Systems with Applications },
  volume = {38},
  number = {6},
  pages = {7112 - 7122},
  year = {2011},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2010.12.058},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417410014211},
  author = {Yao-Te Wang and Anthony J.T. Lee},
  keywords = {Web log mining},
  keywords = {Path traversal graph},
  keywords = {Throughout-surfing pattern},
  keywords = {Browsing behaviour},
  keywords = {Web traversal pattern },
  abstract = {Understanding the navigational behaviour of website visitors is a significant factor of success in the emerging business models of electronic commerce and even mobile commerce. However, Web traversal patterns obtained by traditional Web usage mining approaches are ineffective for the content management of websites. They do not provide the big picture of the intentions of the visitors. The Web navigation patterns, termed throughout-surfing patterns (TSPs) as defined in this paper, are a superset of Web traversal patterns that effectively display the trends toward the next visited Web pages in a browsing session. \{TSPs\} are more expressive for understanding the purposes of website visitors. In this paper, we first introduce the concept of throughout-surfing patterns and then present an efficient method for mining the patterns. We propose a compact graph structure, termed a path traversal graph, to record information about the navigation paths of website visitors. The graph contains the frequent surfing paths that are required for mining TSPs. In addition, we devised a graph traversal algorithm based on the proposed graph structure to discover the TSPs. The experimental results show the proposed mining method is highly efficient in discovering TSPs. }
}
@article{García201177,
  title = {A collaborative educational association rule mining tool },
  journal = {The Internet and Higher Education },
  volume = {14},
  number = {2},
  pages = {77 - 88},
  year = {2011},
  note = {Web mining and higher education: Introduction to the special issue },
  issn = {1096-7516},
  doi = {http://dx.doi.org/10.1016/j.iheduc.2010.07.006},
  url = {http://www.sciencedirect.com/science/article/pii/S1096751610000618},
  author = {Enrique García and Cristóbal Romero and Sebastián Ventura and Carlos de Castro},
  keywords = {Educational data mining tool},
  keywords = {Association rule mining},
  keywords = {Collaborative recommender system },
  abstract = {This paper describes a collaborative educational data mining tool based on association rule mining for the ongoing improvement of e-learning courses, allowing teachers with similar course profiles to share and score the discovered information. The mining tool is designed to be used by instructors who are not experts in data mining, so its internal operation has to be transparent to the user and the instructor can focus on the analysis of the results and make decisions about how to improve the e-learning course. In this paper, the data mining tool is described in a tutorial way and some examples of rules discovered in an adaptive web-based course are shown and explained. }
}
@article{Taniya20121,
  title = {A prioritization analysis of disease association by data-mining of functional annotation of human genes },
  journal = {Genomics },
  volume = {99},
  number = {1},
  pages = {1 - 9},
  year = {2012},
  note = {},
  issn = {0888-7543},
  doi = {http://dx.doi.org/10.1016/j.ygeno.2011.10.002},
  url = {http://www.sciencedirect.com/science/article/pii/S0888754311002230},
  author = {Takayuki Taniya and Susumu Tanaka and Yumi Yamaguchi-Kabata and Hideki Hanaoka and Chisato Yamasaki and Harutoshi Maekawa and Roberto A. Barrero and Boris Lenhard and Milton W. Datta and Mary Shimoyama and Roger Bumgarner and Ranajit Chakraborty and Ian Hopkinson and Libin Jia and Winston Hide and Charles Auffray and Shinsei Minoshima and Tadashi Imanishi and Takashi Gojobori},
  keywords = {Disease},
  keywords = {Rheumatoid arthritis},
  keywords = {Prostate cancer},
  keywords = {Data-mining},
  keywords = {Gene function},
  keywords = {Discriminant analysis },
  abstract = {Complex diseases result from contributions of multiple genes that act in concert through pathways. Here we present a method to prioritize novel candidate disease-susceptibility genes based on their biological similarities to known disease-related genes. The extent of disease-susceptibility of a gene is prioritized by analyzing seven features of human genes captured in H-InvDB. Taking rheumatoid arthritis (RA) and prostate cancer (PC) as two examples, we evaluated the efficiency of our method. Highly scored genes obtained included \{TNFSF12\} and \{OSM\} as candidate disease genes for \{RA\} and PC, respectively. Subsequent characterization of these genes based upon an extensive literature survey reinforced the validity of these highly scored genes as possible disease-susceptibility genes. Our approach, Prioritization \{ANalysis\} of Disease Association (PANDA), is an efficient and cost-effective method to narrow down a large set of genes into smaller subsets that are most likely to be involved in the disease pathogenesis. }
}
@article{Lessmann201112826,
  title = {Tuning metaheuristics: A data mining based approach for particle swarm optimization },
  journal = {Expert Systems with Applications },
  volume = {38},
  number = {10},
  pages = {12826 - 12838},
  year = {2011},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2011.04.075},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417411005914},
  author = {Stefan Lessmann and Marco Caserta and Idel Montalvo Arango},
  keywords = {Metaheuristics},
  keywords = {Particle swarm optimization},
  keywords = {Forecasting},
  keywords = {Data mining },
  abstract = {The paper is concerned with practices for tuning the parameters of metaheuristics. Settings such as the cooling factor in simulated annealing may greatly affect a metaheuristic’s efficiency as well as effectiveness in solving a given decision problem. However, procedures for organizing parameter calibration are scarce and commonly limited to particular metaheuristics. We argue that the parameter selection task can appropriately be addressed by means of a data mining based approach. In particular, a hybrid system is devised, which employs regression models to learn suitable parameter values from past moves of a metaheuristic in an online fashion. In order to identify a suitable regression method and, more generally, to demonstrate the feasibility of the proposed approach, a case study of particle swarm optimization is conducted. Empirical results suggest that characteristics of the decision problem as well as search history data indeed embody information that allows suitable parameter values to be determined, and that this type of information can successfully be extracted by means of nonlinear regression models. }
}
@article{Hüllermeier20111493,
  title = {Fuzzy sets in machine learning and data mining },
  journal = {Applied Soft Computing },
  volume = {11},
  number = {2},
  pages = {1493 - 1505},
  year = {2011},
  note = {The Impact of Soft Computing for the Progress of Artificial Intelligence },
  issn = {1568-4946},
  doi = {http://dx.doi.org/10.1016/j.asoc.2008.01.004},
  url = {http://www.sciencedirect.com/science/article/pii/S1568494608000112},
  author = {Eyke Hüllermeier},
  keywords = {Fuzzy sets},
  keywords = {Machine learning},
  keywords = {Data mining},
  keywords = {Association rules},
  keywords = {Clustering},
  keywords = {Decision trees},
  keywords = {Feature extraction},
  keywords = {Gradual dependency },
  abstract = {Machine learning, data mining, and several related research areas are concerned with methods for the automated induction of models and the extraction of interesting patterns from empirical data. Automated knowledge acquisition of this kind has long been an essential aspect of artificial intelligence and has more recently also attracted considerable attention in the fuzzy sets community. This paper briefly reviews some typical applications and highlights potential contributions that fuzzy set theory can make to machine learning, data mining, and related fields. In this connection, some advantages of fuzzy methods for representing and mining vague patterns in data are especially emphasized. }
}
@article{Raedel2015412,
  title = {Three-year outcomes of root canal treatment: Mining an insurance database },
  journal = {Journal of Dentistry },
  volume = {43},
  number = {4},
  pages = {412 - 417},
  year = {2015},
  note = {},
  issn = {0300-5712},
  doi = {http://dx.doi.org/10.1016/j.jdent.2015.01.013},
  url = {http://www.sciencedirect.com/science/article/pii/S0300571215000299},
  author = {Michael Raedel and Andrea Hartmann and Steffen Bohm and Michael H. Walter},
  keywords = {Health services research},
  keywords = {Endodontics},
  keywords = {Treatment outcome},
  keywords = {Public health},
  keywords = {Tooth (nonvital)},
  keywords = {Root canal therapy},
  keywords = {General practice (dental) },
  abstract = {Objectives There is doubt whether success rates of root canal treatments reported from clinical trials are achievable outside of standardized study populations. The aim of this study was to analyse the outcome of a large number of root canal treatments conducted in general practice. Methods The data were collected from the digital database of a major German national health insurance company. All teeth with complete treatment data were included. Only patients who had been insurance members for the whole 3-year period from 2010 to 2012 were eligible. Kaplan–Meier survival analyses were conducted based on completed root canal treatments. Target events were re-interventions: (1) retreatment of the root canal treatment, (2) apical root resection (apicoectomy) and (3) extraction. The influences of vitality status and root numbers on survival were tested with the log-rank test. Results A total of 556,067 root canal treatments were included. The cumulative overall survival rate for all target events combined was 84.3% for 3 years. The survival rate for nonvital teeth (82.6%) was significantly lower than for vital teeth (85.6%; p < 0.001). The survival rate for single-rooted teeth (83.4%) was significantly lower than for multi-rooted teeth (85.5%; p < 0.001). The most frequent target event was extraction, followed by apical root resection and retreatment. Conclusions Based on these 3-year outcomes, root canal treatment is considered a reliable treatment in routine practice under the conditions of the German national health insurance system. Clinical significance Root canal treatment can be considered a reliable treatment option suitable to salvage most of the affected teeth. This statement applies to treatments that in the vast majority of cases were delivered by general practitioners under the terms and conditions of a nationwide health insurance system. }
}
@article{Dias2011107,
  title = {Whole field tendencies in transcranial magnetic stimulation: A systematic review with data and text mining },
  journal = {Asian Journal of Psychiatry },
  volume = {4},
  number = {2},
  pages = {107 - 112},
  year = {2011},
  note = {},
  issn = {1876-2018},
  doi = {http://dx.doi.org/10.1016/j.ajp.2011.03.003},
  url = {http://www.sciencedirect.com/science/article/pii/S1876201811000372},
  author = {Álvaro Machado Dias and Carlos Gustavo Mansur and Martin Myczkowski and Marco Marcolin},
  keywords = {\{TMS\}},
  keywords = {Neuropsychiatry},
  keywords = {Text mining},
  keywords = {Systematic review},
  keywords = {Depression },
  abstract = {Background Transcranial magnetic stimulation (TMS) has played an important role in the fields of psychiatry, neurology and neuroscience, since its emergence in the mid-1980s; and several high quality reviews have been produced since then. Most high quality reviews serve as powerful tools in the evaluation of predefined tendencies, but they cannot actually uncover new trends within the literature. However, special statistical procedures to ‘mine’ the literature have been developed which aid in achieving such a goal. Objectives This paper aims to uncover patterns within the literature on \{TMS\} as a whole, as well as specific trends in the recent literature on \{TMS\} for the treatment of depression. Methods Data mining and text mining. Results Currently there are 7299 publications, which can be clustered in four essential themes. Considering the frequency of the core psychiatric concepts within the indexed literature, the main results are: depression is present in 13.5% of the publications; Parkinson's disease in 2.94%; schizophrenia in 2.76%; bipolar disorder in 0.158%; and anxiety disorder in 0.142% of all the publications indexed in PubMed. Several other perspectives are discussed in the article. }
}
@article{Chen2011642,
  title = {Extracting hot spots of topics from time-stamped documents },
  journal = {Data & Knowledge Engineering },
  volume = {70},
  number = {7},
  pages = {642 - 660},
  year = {2011},
  note = {},
  issn = {0169-023X},
  doi = {http://dx.doi.org/10.1016/j.datak.2011.03.009},
  url = {http://www.sciencedirect.com/science/article/pii/S0169023X11000449},
  author = {Wei Chen and Parvathi Chundi},
  keywords = {Scan statistic},
  keywords = {Text mining},
  keywords = {Hot spots},
  keywords = {Topics },
  abstract = {Identifying time periods with a burst of activities related to a topic has been an important problem in analyzing time-stamped documents. In this paper, we propose an approach to extract a hot spot of a given topic in a time-stamped document set. Topics can be basic, containing a simple list of keywords, or complex. Logical relationships such as and, or, and not are used to build complex topics from basic topics. A concept of presence measure of a topic based on fuzzy set theory is introduced to compute the amount of information related to the topic in the document set. Each interval in the time period of the document set is associated with a numeric value which we call the discrepancy score. A high discrepancy score indicates that the documents in the time interval are more focused on the topic than those outside of the time interval. A hot spot of a given topic is defined as a time interval with the highest discrepancy score. We first describe a naive implementation for extracting hot spots. We then construct an algorithm called \{EHE\} (Efficient Hot Spot Extraction) using several efficient strategies to improve performance. We also introduce the notion of a topic \{DAG\} to facilitate an efficient computation of presence measures of complex topics. The proposed approach is illustrated by several experiments on a subset of the TDT-Pilot Corpus and \{DBLP\} conference data set. The experiments show that the proposed \{EHE\} algorithm significantly outperforms the naive one, and the extracted hot spots of given topics are meaningful. }
}
@article{Dudas2011687,
  title = {A synergy of multi-objective optimization and data mining for the analysis of a flexible flow shop },
  journal = {Robotics and Computer-Integrated Manufacturing },
  volume = {27},
  number = {4},
  pages = {687 - 695},
  year = {2011},
  note = {Conference papers of Flexible Automation and Intelligent Manufacturing Intelligent manufacturing and services },
  issn = {0736-5845},
  doi = {http://dx.doi.org/10.1016/j.rcim.2010.12.005},
  url = {http://www.sciencedirect.com/science/article/pii/S0736584511000068},
  author = {Catarina Dudas and Marcus Frantzén and Amos H.C. Ng},
  keywords = {Data mining},
  keywords = {Decision trees},
  keywords = {Post-optimality analysis},
  keywords = {Simulation-based optimization },
  abstract = {A method for analyzing production systems by applying multi-objective optimization and data mining techniques on discrete-event simulation models, the so-called Simulation-based Innovization (SBI), is presented in this paper. The aim of the \{SBI\} analysis is to reveal insight into the parameters that affect the performance measures as well as to gain deeper understanding of the problem, through post-optimality analysis of the solutions acquired from multi-objective optimization. This paper provides empirical results from an industrial case study, carried out on an automotive machining line, in order to explain the \{SBI\} procedure. The \{SBI\} method has been found to be particularly suitable in this case study as the three objectives under study, namely total tardiness, makespan and average work-in-process, are in conflict with each other. Depending on the system load of the line, different decision variables have been found to be influential. The paper also addresses how the \{SBI\} method is used to find important patterns in the explored solution set and how it can support decision making to improve scheduling under different system loadings in the machining line. }
}
@article{Peng2011316,
  title = {An incident information management framework based on data integration, data mining, and multi-criteria decision making },
  journal = {Decision Support Systems },
  volume = {51},
  number = {2},
  pages = {316 - 327},
  year = {2011},
  note = {Multiple Criteria Decision Making and Decision Support Systems },
  issn = {0167-9236},
  doi = {http://dx.doi.org/10.1016/j.dss.2010.11.025},
  url = {http://www.sciencedirect.com/science/article/pii/S016792361000206X},
  author = {Yi Peng and Yong Zhang and Yu Tang and Shiming Li},
  keywords = {Incident information management},
  keywords = {Data integration},
  keywords = {Data mining},
  keywords = {Multiple criteria decision making},
  keywords = {Decision support system },
  abstract = {An effective incident information management system needs to deal with several challenges. It must support heterogeneous distributed incident data, allow decision makers (DMs) to detect anomalies and extract useful knowledge, assist \{DMs\} in evaluating the risks and selecting an appropriate alternative during an incident, and provide differentiated services to satisfy the requirements of different incident management phases. To address these challenges, this paper proposes an incident information management framework that consists of three major components. The first component is a high-level data integration module in which heterogeneous data sources are integrated and presented in a uniform format. The second component is a data mining module that uses data mining methods to identify useful patterns and presents a process to provide differentiated services for pre-incident and post-incident information management. The third component is a multi-criteria decision-making (MCDM) module that utilizes \{MCDM\} methods to assess the current situation, find the satisfactory solutions, and take appropriate responses in a timely manner. To validate the proposed framework, this paper conducts a case study on agrometeorological disasters that occurred in China between 1997 and 2001. The case study demonstrates that the combination of data mining and \{MCDM\} methods can provide objective and comprehensive assessments of incident risks. }
}
@article{Porter2011171,
  title = {Mining external R&D },
  journal = {Technovation },
  volume = {31},
  number = {4},
  pages = {171 - 176},
  year = {2011},
  note = {Managing Technology },
  issn = {0166-4972},
  doi = {http://dx.doi.org/10.1016/j.technovation.2011.01.001},
  url = {http://www.sciencedirect.com/science/article/pii/S0166497211000113},
  author = {Alan L. Porter and Nils C. Newman},
  keywords = {Tech mining},
  keywords = {Text mining},
  keywords = {Open Innovation},
  keywords = {Literature-based discovery},
  keywords = {R&D management},
  keywords = {Research profiling},
  keywords = {Knowledge discovery },
  abstract = {Open Innovation presses the case for timely and thorough intelligence concerning research and development activities conducted outside one’s organization. To take advantage of this wealth of R&D, one needs to establish a systematic “tech mining” process. We propose a 5-stage framework that extends literature review into research profiling and pattern recognition to answer posed technology management questions. Ultimately one can even discover new knowledge by screening research databases. Once one determines the value in mining external R&D, tough issues remain to be overcome. Technology management has developed a culture that relies more on intuition than on evidence. Changing that culture and implementing effective technical intelligence capabilities is worth the effort. P&G's reported gains in innovation call attention to the huge payoff potential. }
}
@article{O'Leary2011821,
  title = {Blog mining-review and extensions: “From each according to his opinion” },
  journal = {Decision Support Systems },
  volume = {51},
  number = {4},
  pages = {821 - 830},
  year = {2011},
  note = {Recent Advances in Data, Text, and Media Mining & Information Issues in Supply Chain and in Service System Design },
  issn = {0167-9236},
  doi = {http://dx.doi.org/10.1016/j.dss.2011.01.016},
  url = {http://www.sciencedirect.com/science/article/pii/S0167923611000455},
  author = {Daniel E. O'Leary},
  keywords = {Blogs},
  keywords = {Blog mining},
  keywords = {Financial blogs},
  keywords = {Sentiment},
  keywords = {Corporate image},
  keywords = {Public image},
  keywords = {Blogs and Sales },
  abstract = {Blogs provide a type of website that contains information and personal opinions of the individual authors. The purpose of this paper is to review some of the literature aimed at gathering opinion, sentiment and information from blogs. This paper also extends the previous literature in a number of directions: extending the use of knowledge from tags on blogs, finding the need for domain-specific terms to capture a richer understanding of the mood of a blog, and finding a relationship between information in message boards and blogs. The relationships between blog chatter and sales, and between blogs and public image, are also examined. }
}
@article{Cardol20111390,
  title = {Mitochondrial NADH:ubiquinone oxidoreductase (complex I) in eukaryotes: A highly conserved subunit composition highlighted by mining of protein databases },
  journal = {Biochimica et Biophysica Acta (BBA) - Bioenergetics },
  volume = {1807},
  number = {11},
  pages = {1390 - 1397},
  year = {2011},
  note = {},
  issn = {0005-2728},
  doi = {http://dx.doi.org/10.1016/j.bbabio.2011.06.015},
  url = {http://www.sciencedirect.com/science/article/pii/S0005272811001575},
  author = {Pierre Cardol},
  keywords = {Mitochondrial NADH:ubiquinone oxidoreductase},
  keywords = {Profile-based search},
  keywords = {Eukaryote evolution},
  keywords = {Database mining},
  keywords = {Complex I subunit composition },
  abstract = {Complex I (NADH:ubiquinone oxidoreductase) is the largest enzyme of the mitochondrial respiratory chain. Compared to its bacterial counterpart, which encompasses 14–17 subunits, mitochondrial complex I has almost tripled its subunit composition during the evolution of eukaryotes, by recruitment of so-called accessory subunits, some of them specific to distinct evolutionary lineages. The increasing availability of numerous broadly sampled eukaryotic genomes now enables the reconstruction of the evolutionary history of this large protein complex. Here, a combination of profile-based sequence comparisons and analyses of basic structural properties at the protein level made it possible to pinpoint homology relationships between complex I subunits from fungi, mammals or green plants, previously identified as subunits. In addition, homologs of at least 40 mammalian complex I subunits are present in representatives of all major eukaryote assemblages, half of which have not been investigated so far (Excavates, Chromalveolates, Amoebozoa). This analysis revealed that complex I was subject to a phenomenal increase in size that predated the diversification of extant eukaryotes, followed by very few lineage-specific additions/losses of subunits. The implications of this subunit conservation for studies of complex I are discussed. }
}
@article{Jonquet2011316,
  title = {\{NCBO\} Resource Index: Ontology-based search and mining of biomedical resources },
  journal = {Web Semantics: Science, Services and Agents on the World Wide Web },
  volume = {9},
  number = {3},
  pages = {316 - 324},
  year = {2011},
  note = {Semantic Web Dynamics Semantic Web Challenge, 2010 },
  issn = {1570-8268},
  doi = {http://dx.doi.org/10.1016/j.websem.2011.06.005},
  url = {http://www.sciencedirect.com/science/article/pii/S1570826811000485},
  author = {Clement Jonquet and Paea LePendu and Sean Falconer and Adrien Coulet and Natalya F. Noy and Mark A. Musen and Nigam H. Shah},
  keywords = {Ontology-based indexing},
  keywords = {Semantic annotation},
  keywords = {Information mining},
  keywords = {Information retrieval},
  keywords = {Biomedical data},
  keywords = {Biomedical ontologies },
  abstract = {The volume of publicly available data in biomedicine is constantly increasing. However, these data are stored in different formats and on different platforms. Integrating these data will enable us to facilitate the pace of medical discoveries by providing scientists with a unified view of this diverse information. Under the auspices of the National Center for Biomedical Ontology (NCBO), we have developed the Resource Index – a growing, large-scale ontology-based index of more than twenty heterogeneous biomedical resources. The resources come from a variety of repositories maintained by organizations from around the world. We use a set of over 200 publicly available ontologies contributed by researchers in various domains to annotate the elements in these resources. We use the semantics that the ontologies encode, such as different properties of classes, the class hierarchies, and the mappings between ontologies, in order to improve the search experience for the Resource Index user. Our user interface enables scientists to search the multiple resources quickly and efficiently using domain terms, without even being aware that there is semantics “under the hood.” }
}
@article{Chen20111264,
  title = {A personal route prediction system based on trajectory data mining },
  journal = {Information Sciences },
  volume = {181},
  number = {7},
  pages = {1264 - 1284},
  year = {2011},
  note = {},
  issn = {0020-0255},
  doi = {http://dx.doi.org/10.1016/j.ins.2010.11.035},
  url = {http://www.sciencedirect.com/science/article/pii/S0020025510005888},
  author = {Ling Chen and Mingqi Lv and Qian Ye and Gencai Chen and John Woodward},
  keywords = {Data mining},
  keywords = {\{GPS\}},
  keywords = {Route pattern},
  keywords = {Route prediction},
  keywords = {Privacy },
  abstract = {This paper presents a system where the personal route of a user is predicted using a probabilistic model built from the historical trajectory data. Route patterns are extracted from personal trajectory data using a novel mining algorithm, Continuous Route Pattern Mining (CRPM), which can tolerate different kinds of disturbance in trajectory data. Furthermore, a client–server architecture is employed which has the dual purpose of guaranteeing the privacy of personal data and greatly reducing the computational load on mobile devices. An evaluation using a corpus of trajectory data from 17 people demonstrates that \{CRPM\} can extract longer route patterns than current methods. Moreover, the average correct rate of one-step prediction of our system is greater than 71%, and the average Levenshtein distance of continuous route prediction of our system is about 30% shorter than that of the Markov model-based method. }
}
@article{CruzRoa201191,
  title = {Visual pattern mining in histology image collections using bag of features },
  journal = {Artificial Intelligence in Medicine },
  volume = {52},
  number = {2},
  pages = {91 - 106},
  year = {2011},
  note = {Artificial Intelligence in Medicine \{AIME\} 2009 },
  issn = {0933-3657},
  doi = {http://dx.doi.org/10.1016/j.artmed.2011.04.010},
  url = {http://www.sciencedirect.com/science/article/pii/S0933365711000510},
  author = {Angel Cruz-Roa and Juan C. Caicedo and Fabio A. González},
  keywords = {Collection-based image analysis},
  keywords = {Visual pattern mining},
  keywords = {Visual knowledge discovery},
  keywords = {Bag of features (BOF)},
  keywords = {Visual-codebook feature selection},
  keywords = {Kernel-based image annotation},
  keywords = {Identification of visual patterns},
  keywords = {Histology and histopathology images},
  keywords = {Basal-cell carcinoma},
  keywords = {Fundamental tissues },
  abstract = {Objective The paper addresses the problem of finding visual patterns in histology image collections. In particular, it proposes a method for correlating basic visual patterns with high-level concepts combining an appropriate image collection representation with state-of-the-art machine learning techniques. Methodology The proposed method starts by representing the visual content of the collection using a bag-of-features strategy. Then, two main visual mining tasks are performed: finding associations between visual patterns and high-level concepts, and performing automatic image annotation. Associations are found using minimum-redundancy-maximum-relevance feature selection and co-clustering analysis. Annotation is done by applying a support-vector-machine classifier. Additionally, the proposed method includes an interpretation mechanism that associates concept annotations with corresponding image regions. The method was evaluated on two data sets: one comprising histology images from the four different fundamental tissues, and the other composed of histopathology images used for cancer diagnosis. Different visual-word representations and codebook sizes were tested. The performance in both concept association and image annotation tasks was qualitatively and quantitatively evaluated. Results The results show that the method is able to find highly discriminative visual features and to associate them with high-level concepts. In the annotation task the method showed competitive performance: an increase of 21% in f-measure with respect to the baseline in the histopathology data set, and an increase of 47% in the histology data set. Conclusions The experimental evidence suggests that the bag-of-features representation is a good alternative to represent visual content in histology images. The proposed method exploits this representation to perform visual pattern mining from a wider perspective where the focus is the image collection as a whole, rather than individual images. }
}
@article{Poelmans20113870,
  title = {Text mining with emergent self organizing maps and multi-dimensional scaling: A comparative study on domestic violence },
  journal = {Applied Soft Computing },
  volume = {11},
  number = {4},
  pages = {3870 - 3876},
  year = {2011},
  note = {},
  issn = {1568-4946},
  doi = {http://dx.doi.org/10.1016/j.asoc.2011.02.026},
  url = {http://www.sciencedirect.com/science/article/pii/S1568494611000871},
  author = {Jonas Poelmans and Marc M. Van Hulle and Stijn Viaene and Paul Elzinga and Guido Dedene},
  keywords = {Emergent self organizing map (ESOM)},
  keywords = {Multi-dimensional scaling (MDS)},
  keywords = {Domestic violence},
  keywords = {Exploratory data analysis},
  keywords = {Knowledge discovery in databases},
  keywords = {Text mining },
  abstract = {In this paper we compare the usability of \{ESOM\} and \{MDS\} as text exploration instruments in police investigations. We combine them with traditional classification instruments such as the \{SVM\} and Naïve Bayes. We present a real-life data mining case using a dataset consisting of police reports describing a wide range of violent incidents that occurred during the year 2007 in the Amsterdam-Amstelland police region (The Netherlands). We compare the possibilities offered by the \{ESOM\} and \{MDS\} for iteratively enriching our feature set, discovering confusing situations and faulty case labelings, and significantly improving the classification accuracy. The results of our research are currently operational in the Amsterdam-Amstelland police region for upgrading the employed domestic violence definition, for improving the training of police officers and for developing a highly accurate and comprehensible case triage model. }
}
@article{Fu2011164,
  title = {A review on time series data mining },
  journal = {Engineering Applications of Artificial Intelligence },
  volume = {24},
  number = {1},
  pages = {164 - 181},
  year = {2011},
  note = {},
  issn = {0952-1976},
  doi = {http://dx.doi.org/10.1016/j.engappai.2010.09.007},
  url = {http://www.sciencedirect.com/science/article/pii/S0952197610001727},
  author = {Tak-chung Fu},
  keywords = {Time series data mining},
  keywords = {Representation},
  keywords = {Similarity measure},
  keywords = {Segmentation},
  keywords = {Visualization },
  abstract = {Time series is an important class of temporal data objects that can be easily obtained from scientific and financial applications. A time series is a collection of observations made chronologically. Time series data are characterized by large data size, high dimensionality, and the need for continuous updating; moreover, being numerical and continuous in nature, a time series is always considered as a whole rather than as individual numerical fields. The increasing use of time series data has initiated a great deal of research and development in the field of data mining. The abundance of research on time series data mining in the last decade could hamper the entry of interested researchers, due to its complexity. In this paper, a comprehensive review of the existing time series data mining research is given. The works are generally categorized into representation and indexing, similarity measure, segmentation, visualization and mining. Moreover, state-of-the-art research issues are also highlighted. The primary objective of this paper is to serve as a glossary for interested researchers to gain an overall picture of current time series data mining developments and to identify potential research directions for further investigation. }
}
@article{Laurence2011278,
  title = {Establishing a sustainable mining operation: an overview },
  journal = {Journal of Cleaner Production },
  volume = {19},
  number = {2–3},
  pages = {278 - 284},
  year = {2011},
  note = {},
  issn = {0959-6526},
  doi = {http://dx.doi.org/10.1016/j.jclepro.2010.08.019},
  url = {http://www.sciencedirect.com/science/article/pii/S0959652610003471},
  author = {David Laurence},
  keywords = {Sustainable mining},
  keywords = {Premature mine closures},
  keywords = {Community},
  keywords = {Economy},
  keywords = {Environment},
  keywords = {Safety},
  keywords = {Resource efficiency },
  abstract = {In a review of the literature on sustainability in mining, it was found that there is limited guidance for mine operators to put sustainability frameworks and theory into action on the ground. This paper argues that operators can improve the sustainability of their mine sites by ensuring that leading practices are implemented in five areas. In addition to the widely-accepted dimensions of Environment, Economic and Community, Safety and Resource Efficiency must be addressed. The need for highlighting these additional elements is demonstrated in an analysis of over one thousand unplanned or prematurely closed mines over the past 30 years. }
}
@article{Vo20151684,
  title = {Learning to classify short text from scientific documents using topic models with various types of knowledge },
  journal = {Expert Systems with Applications },
  volume = {42},
  number = {3},
  pages = {1684 - 1698},
  year = {2015},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2014.09.031},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417414005764},
  author = {Duc-Thuan Vo and Cheol-Young Ock},
  keywords = {Data sparseness},
  keywords = {Information retrieval},
  keywords = {Latent Dirichlet Allocation},
  keywords = {Short text classification},
  keywords = {Topic model },
  abstract = {Classification of short text is challenging due to data sparseness, which is a typical characteristic of short text. In this paper, we propose methods for enhancing features using topic models, which make short text seem less sparse and more topic-oriented for classification. We exploited topic model analysis based on Latent Dirichlet Allocation for enriched datasets, and then we presented new methods for enhancing features by combining external texts from topic models that make documents more effective for classification. In experiments, we utilized the title contents of scientific articles as short text documents, and then enriched these documents using topic models from various types of universal datasets for classification in order to show that our approach performs efficiently. }
}
@article{Mohammad20111237,
  title = {A novel intrusion detection system by using intelligent data mining in weka environment },
  journal = {Procedia Computer Science },
  volume = {3},
  number = {0},
  pages = {1237 - 1242},
  year = {2011},
  note = {World Conference on Information Technology },
  issn = {1877-0509},
  doi = {http://dx.doi.org/10.1016/j.procs.2010.12.198},
  url = {http://www.sciencedirect.com/science/article/pii/S1877050910005739},
  author = {Muamer N. Mohammad and Norrozila Sulaiman and Osama Abdulkarim Muhsin},
  keywords = {Data mining},
  keywords = {Intrusion detection system},
  keywords = {\{WEKA\} },
  abstract = {Nowadays, the use of intelligent data mining approaches to predict intrusions in local area networks has been increasing rapidly. In this paper, an improved approach for an Intrusion Detection System (IDS) based on combining data mining and an expert system is presented and implemented in WEKA. The taxonomy consists of a classification of the detection principle as well as certain \{WEKA\} aspects of the intrusion detection system, such as open-source data mining. Combining the methods may give better performance of \{IDS\} systems and make the detection more effective. The evaluation of the new design produced better results in terms of detection efficiency and false alarm rate than existing approaches. This presents useful information for intrusion detection. }
}
@article{Tzanis201112398,
  title = {PolyA-iEP: A data mining method for the effective prediction of polyadenylation sites },
  journal = {Expert Systems with Applications },
  volume = {38},
  number = {10},
  pages = {12398 - 12408},
  year = {2011},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2011.04.019},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417411005355},
  author = {George Tzanis and Ioannis Kavakiotis and Ioannis Vlahavas},
  keywords = {Data mining},
  keywords = {Machine learning},
  keywords = {Classification},
  keywords = {Emerging pattern},
  keywords = {Bioinformatics},
  keywords = {Polyadenylation },
  abstract = {This paper presents a study on polyadenylation site prediction, which is a very important problem in bioinformatics and medicine, promising to give a lot of answers especially in cancer research. We describe a method, called PolyA-iEP, that we developed for predicting polyadenylation sites and we present a systematic study of the problem of recognizing mRNA 3′ ends which contain a polyadenylation site using the proposed method. PolyA-iEP is a modular system consisting of two main components that both contribute substantially to the descriptive and predictive potential of the system. In specific, PolyA-iEP exploits the advantages of emerging patterns, namely high understandability and discriminating power and the strength of a distance-based scoring method that we propose. The extracted emerging patterns may span across many elements around the polyadenylation site and can provide novel and interesting biological insights. The outputs of these two components are finally combined by a classifier in a highly effective framework, which in our setup reaches 93.7% of sensitivity and 88.2% of specificity. PolyA-iEP can be parameterized and used for both descriptive and predictive analysis. We have experimented with Arabidopsis thaliana sequences for evaluating our method and we have drawn important conclusions. }
}
@article{Yin20115737,
  title = {Experimental study on fighters behaviors mining },
  journal = {Expert Systems with Applications },
  volume = {38},
  number = {5},
  pages = {5737 - 5747},
  year = {2011},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2010.10.059},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417410012030},
  author = {Yunfei Yin and Guanghong Gong and Liang Han},
  keywords = {Experimental study},
  keywords = {Fighters behaviors},
  keywords = {Data mining},
  keywords = {Patterns},
  keywords = {Basic Fighter Maneuvers (BFMs) },
  abstract = {Effective prediction of fighters’ behaviors is crucial for air combat as well as for many other game fields. In this paper, we present three patterns to predict the behaviors of fighters: the ActionStreams pattern, the Owner_Actions pattern and the Time_Owner_Actions pattern, where: (1) the ActionStreams pattern is a coarse granularity for describing a fighter’s behaviors with an action identifier, without distinguishing the time or the executor/owner; (2) the Owner_Actions pattern is a finer granularity for describing a fighter’s behaviors with the action identifier and the executor, without distinguishing the time; and (3) the Time_Owner_Actions pattern encapsulates the action identifier, the time, and the executor. Based on these fighters’ behavior patterns, we explore the data structures used for storage and the properties used for mining; further, by designing and implementing the relevant mining/processing algorithms and systems, we have discovered some experience patterns of fighters’ behaviors and have conducted valid predictions of them. We also present experimental results obtained on a simulation platform for air-to-air combat. The results show that our method is effective. }
}
@article{Yang20151340,
  title = {A novel contextual topic model for multi-document summarization },
  journal = {Expert Systems with Applications },
  volume = {42},
  number = {3},
  pages = {1340 - 1352},
  year = {2015},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2014.09.015},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417414005545},
  author = {Guangbing Yang and Dunwei Wen and Kinshuk and Nian-Shing Chen and Erkki Sutinen},
  keywords = {Multi-document summarization},
  keywords = {Hierarchical topic model},
  keywords = {Contextual topic },
  abstract = {Information overload becomes a serious problem in the digital age. It negatively impacts understanding of useful information. How to alleviate this problem is the main concern of research on natural language processing, especially multi-document summarization. With the aim of seeking a new method to help justify the importance of similar sentences in multi-document summarizations, this study proposes a novel approach based on recent hierarchical Bayesian topic models. The proposed model incorporates the concepts of n-grams into hierarchically latent topics to capture the word dependencies that appear in the local context of a word. The quantitative and qualitative evaluation results show that this model has outperformed both hLDA and \{LDA\} in document modeling. In addition, the experimental results in practice demonstrate that our summarization system implementing this model can significantly improve the performance and make it comparable to the state-of-the-art summarization systems. }
}
@article{Geng2015236,
  title = {Prediction of financial distress: An empirical study of listed Chinese companies using data mining },
  journal = {European Journal of Operational Research },
  volume = {241},
  number = {1},
  pages = {236 - 247},
  year = {2015},
  note = {},
  issn = {0377-2217},
  doi = {http://dx.doi.org/10.1016/j.ejor.2014.08.016},
  url = {http://www.sciencedirect.com/science/article/pii/S0377221714006511},
  author = {Ruibin Geng and Indranil Bose and Xi Chen},
  keywords = {Chinese companies},
  keywords = {Financial distress},
  keywords = {Financial indicators},
  keywords = {Neural network},
  keywords = {Majority voting },
  abstract = {The deterioration in profitability of listed companies not only threatens the interests of the enterprise and internal staff, but also makes investors face significant financial loss. It is important to establish an effective early warning system for prediction of financial crisis for better corporate governance. This paper studies the phenomenon of financial distress for 107 Chinese companies that received the label ‘special treatment’ from 2001 to 2008 by the Shanghai Stock Exchange and the Shenzhen Stock Exchange. We use data mining techniques to build financial distress warning models based on 31 financial indicators and three different time windows by comparing these 107 firms to a control group of firms. We observe that the performance of neural networks is more accurate than other classifiers, such as decision trees and support vector machines, as well as an ensemble of multiple classifiers combined using majority voting. An important contribution of the paper is to discover that financial indicators, such as net profit margin of total assets, return on total assets, earnings per share, and cash flow per share, play an important role in prediction of deterioration in profitability. This paper provides a suitable method for prediction of financial distress for listed companies in China. }
}
@article{Yang2011647,
  title = {Multilingual document mining and navigation using self-organizing maps },
  journal = {Information Processing & Management },
  volume = {47},
  number = {5},
  pages = {647 - 666},
  year = {2011},
  note = {Managing and Mining Multilingual Documents },
  issn = {0306-4573},
  doi = {http://dx.doi.org/10.1016/j.ipm.2009.12.003},
  url = {http://www.sciencedirect.com/science/article/pii/S0306457309001423},
  author = {Hsin-Chang Yang and Han-Wei Hsiao and Chung-Hong Lee},
  keywords = {Multilingual Web page navigation},
  keywords = {Multilingual text mining},
  keywords = {Self-organizing map},
  keywords = {Hierarchy alignment },
  abstract = {One major approach to information finding on the \{WWW\} is to navigate through some Web directories and browse them until the goal pages are found. However, such directories are generally constructed manually and may have the disadvantages of narrow coverage and inconsistency. Besides, most existing directories provide only monolingual hierarchies that organize Web pages in terms that a user may not be familiar with. In this work, we propose an approach that automatically arranges multilingual Web pages into a multilingual Web directory to break the language barriers in Web navigation. In this approach, a self-organizing map is trained on each set of monolingual Web pages to obtain two feature maps, which reveal the relationships among Web pages and thematic keywords, respectively, for that language. We then apply a hierarchy generation process on these maps to obtain the monolingual hierarchy for these Web pages. A hierarchy alignment method is then applied to these monolingual hierarchies to discover the associations between nodes in different hierarchies. Finally, a multilingual Web directory is constructed according to these associations. We applied the proposed approach to a set of Web pages and obtained interesting results that demonstrate the feasibility of our method for multilingual Web navigation. }
}
@article{Pang2011352,
  title = {Summarizing tourist destinations by mining user-generated travelogues and photos },
  journal = {Computer Vision and Image Understanding },
  volume = {115},
  number = {3},
  pages = {352 - 363},
  year = {2011},
  note = {Special issue on Feature-Oriented Image and Video Computing for Extracting Contexts and Semantics },
  issn = {1077-3142},
  doi = {http://dx.doi.org/10.1016/j.cviu.2010.10.010},
  url = {http://www.sciencedirect.com/science/article/pii/S1077314210002419},
  author = {Yanwei Pang and Qiang Hao and Yuan Yuan and Tanji Hu and Rui Cai and Lei Zhang},
  keywords = {Travelogue mining},
  keywords = {Destination summarization},
  keywords = {User-generated content},
  keywords = {Virtual tour },
  abstract = {Automatically summarizing tourist destinations with both textual and visual descriptions is highly desired for online services such as travel planning, to help users understand the local characteristics of tourist destinations. Travelers are contributing a great deal of user-generated travelogues and photos on the Web, which contain abundant travel-related information and cover various aspects (e.g., landmarks, styles, activities) of most locations in the world. To leverage the collective knowledge of travelers for destination summarization, in this paper we propose a framework which discovers location-representative tags from travelogues and then selects relevant and representative photos to visualize these tags. The learnt tags and selected photos are finally organized appropriately to provide an informative summary which describes a given destination both textually and visually. Experimental results based on a large collection of travelogues and photos show promising results on destination summarization. }
}
@article{Thorleuchter20107182,
  title = {Mining ideas from textual information },
  journal = {Expert Systems with Applications },
  volume = {37},
  number = {10},
  pages = {7182 - 7188},
  year = {2010},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2010.04.013},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417410002848},
  author = {Dirk Thorleuchter and Dirk Van den Poel and Anita Prinzie},
  keywords = {Idea mining},
  keywords = {Text mining},
  keywords = {Text classification},
  keywords = {Technology },
  abstract = {This approach introduces idea mining as the process of extracting new and useful ideas from unstructured text. We use an idea definition from the philosophy of technology and focus on ideas that can be used to solve technological problems. The rationale for the idea mining approach is taken from psychology and cognitive science and follows how persons create ideas. To realize the processing, we use methods from text mining and text classification (tokenization, term filtering methods, Euclidean distance measure, etc.) and combine them with a new heuristic measure for mining ideas. As a result, the idea mining approach automatically extracts new and useful ideas from a user-given text. We present these problem-solution ideas in a comprehensible way to support users in problem solving. This approach is evaluated with patent data and is realized as a web-based application, named ‘Technological Idea Miner’, that can be used for further testing and evaluation. }
}
@article{vanderAalst2011450,
  title = {Time prediction based on process mining },
  journal = {Information Systems },
  volume = {36},
  number = {2},
  pages = {450 - 475},
  year = {2011},
  note = {Special Issue: Semantic Integration of Data, Multimedia, and Services },
  issn = {0306-4379},
  doi = {http://dx.doi.org/10.1016/j.is.2010.09.001},
  url = {http://www.sciencedirect.com/science/article/pii/S0306437910000864},
  author = {W.M.P. van der Aalst and M.H. Schonenberg and M. Song},
  keywords = {Process mining},
  keywords = {Business process management},
  keywords = {Time prediction},
  keywords = {Performance analysis},
  keywords = {Business intelligence },
  abstract = {Process mining allows for the automated discovery of process models from event logs. These models provide insights and enable various types of model-based analysis. This paper demonstrates that the discovered process models can be extended with information to predict the completion time of running instances. There are many scenarios where it is useful to have reliable time predictions. For example, when a customer phones her insurance company for information about her insurance claim, she can be given an estimate for the remaining processing time. In order to do this, we provide a configurable approach to construct a process model, augment this model with time information learned from earlier instances, and use this to predict e.g., the completion time. To provide meaningful time predictions we use a configurable set of abstractions that allow for a good balance between “overfitting” and “underfitting”. The approach has been implemented in ProM and through several experiments using real-life event logs we demonstrate its applicability. }
}
@article{Park2011497,
  title = {Foresighted tree configuration games in resource constrained distributed stream mining sensors },
  journal = {Ad Hoc Networks },
  volume = {9},
  number = {4},
  pages = {497 - 513},
  year = {2011},
  note = {Multimedia Ad Hoc and Sensor Networks },
  issn = {1570-8705},
  doi = {http://dx.doi.org/10.1016/j.adhoc.2010.08.007},
  url = {http://www.sciencedirect.com/science/article/pii/S1570870510001034},
  author = {Hyunggon Park and Deepak S. Turaga and Olivier Verscheure and Mihaela van der Schaar},
  keywords = {Resource constrained stream mining sensors},
  keywords = {Tree configuration games},
  keywords = {Myopic and foresighted strategies},
  keywords = {Coalitions for distributed classifiers},
  keywords = {Binary classifier trees },
  abstract = {We consider the problem of optimizing stream mining applications that are constructed as tree topologies of classifiers and deployed on a set of resource constrained and distributed processing nodes (or sensors). The optimization involves selecting appropriate false-alarm detection tradeoffs (operating points) for each classifier to minimize an end-to-end misclassification penalty, while satisfying resource constraints. We design distributed solutions, by defining tree configuration games, where individual classifiers configure themselves to maximize an appropriate local utility. We define the local utility functions and determine the information that needs to be exchanged across classifiers in order to design the distributed solutions. We analytically show that there is a unique pure strategy Nash equilibrium in operating points, which guarantees convergence of the proposed approach. We develop both myopic strategy, where the utility is purely local to the current classifier, and foresighted strategy, where the utility includes impact of classifier’s actions on successive classifiers. We analytically show that actions determined based on foresighted strategies improve the end-to-end performance of the classifier tree, by deriving an associated probability bound. We also investigate the impact of resource constraints on the classifier action selections for each strategy, and the corresponding application performance. We propose a learning-based approach, which enables each classifier to effectively adapt to the dynamic changes of resource constraints. We evaluate the performance of our solutions on an application for sports scene classification. We show that foresighted strategies result in better performance than myopic strategies in both resource unconstrained and resource constrained scenarios, and asymptotically approach the centralized optimal solution. We also show that the proposed distributed solutions outperform the centralized solution based on the Sequential Quadratic Programming on average in resource unconstrained scenarios. }
}
@article{Li20108850,
  title = {Mining frequent patterns from network flows for monitoring network },
  journal = {Expert Systems with Applications },
  volume = {37},
  number = {12},
  pages = {8850 - 8860},
  year = {2010},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2010.06.012},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417410005166},
  author = {Xin Li and Zhi-Hong Deng},
  keywords = {Network monitoring},
  keywords = {Anomaly detection},
  keywords = {Frequent pattern mining},
  keywords = {Sliding window },
  abstract = {Because of the varying and dynamic characteristics of network traffic, which is fast-moving, huge in volume, short-lived, inestimable and infinite, it is a serious challenge for network administrators to monitor network traffic in real time and judge whether the whole network works well. Currently, most of the existing techniques in this area are based on signature training, learning or matching, which may be too complicated to satisfy timeliness requirements. Other statistical methods, including sampling, hashing or counting, are all approximate and compute an incomplete set of results. Since the main objective of network monitoring is to discover and understand the active events that happen frequently and may influence or even ruin the whole network, in this paper we aim to use frequent pattern mining to find these events. We first design a sliding window model to keep the mining results novel and integrated; then, considering the distribution and fluidity of network flows, we develop a powerful class of algorithms comprising a vertical re-mining algorithm, a multi-pattern re-mining algorithm, a fast multi-pattern capturing algorithm and a fast multi-pattern capturing supplement algorithm to deal with a series of problems that arise when applying frequent pattern mining to network traffic analysis. Finally, we develop a monitoring system to evaluate our algorithms on real traces collected from the campus network of Peking University. The results show that the algorithms are effective and our system can identify a great deal of potentially valuable information in time, which greatly helps network administrators to understand regular applications and detect network anomalies. Thus the research in this paper not only provides a new application area for frequent pattern mining, but also provides a new technique for network monitoring. }
}
@article{Tsai200911617,
  title = {Mining frequent itemsets in data streams using the weighted sliding window model },
  journal = {Expert Systems with Applications },
  volume = {36},
  number = {9},
  pages = {11617 - 11625},
  year = {2009},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2009.03.025},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417409002723},
  author = {Pauray S.M. Tsai},
  keywords = {Data mining},
  keywords = {Data stream},
  keywords = {Weighted sliding window model},
  keywords = {Association rule},
  keywords = {Frequent itemset },
  abstract = {In recent years, data stream mining has become an important research topic. With the emergence of new applications, the data we process are no longer static, but continuous, dynamic data streams. Examples include network traffic analysis, Web click stream mining, network intrusion detection, and on-line transaction analysis. In this paper, we propose a new framework for data stream mining, called the weighted sliding window model. The proposed model allows the user to specify the number of windows for mining, the size of a window, and the weight for each window. Thus users can assign a higher weight to a more significant data section, which makes the mining results closer to the user’s requirements. Based on the weighted sliding window model, we propose a single-pass algorithm, called WSW, to efficiently discover all the frequent itemsets from data streams. By analyzing data characteristics, an improved algorithm, called WSW-Imp, is developed to further reduce the time needed to decide whether a candidate itemset is frequent or not. Empirical results show that WSW-Imp outperforms \{WSW\} under the weighted sliding window model. }
}
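
Editor's note: the weighted sliding window model described in the Tsai (2009) entry above weights each window's contribution to an itemset's support. The following Python sketch is only a rough, hypothetical illustration of that weighting idea, not the authors' WSW algorithm; the window contents, weights and threshold are made up.

# Minimal sketch of weighted support over user-weighted windows, in the
# spirit of the weighted sliding window model above; not the WSW algorithm.
from collections import Counter
from itertools import combinations

def weighted_frequent_itemsets(windows, weights, min_wsupport, max_size=2):
    """windows: list of windows, each a list of transactions (frozensets);
    weights: one weight per window, assumed to sum to 1."""
    totals = Counter()
    for window, weight in zip(windows, weights):
        counts = Counter()
        for tx in window:
            for k in range(1, max_size + 1):
                for itemset in combinations(sorted(tx), k):
                    counts[itemset] += 1
        for itemset, c in counts.items():
            # weight each window's relative frequency of the itemset
            totals[itemset] += weight * (c / len(window))
    return {s: v for s, v in totals.items() if v >= min_wsupport}

# Two windows; the newer window gets the higher weight.
old = [frozenset("ab"), frozenset("abc")]
new = [frozenset("bc"), frozenset("b")]
print(weighted_frequent_itemsets([old, new], [0.4, 0.6], min_wsupport=0.5))

With these toy numbers, itemsets frequent in the more recent, more heavily weighted window (such as {b}) pass the threshold even if they are rarer overall.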
@article{Xie20114008,
  title = {Data mining of graduation project selection database },
  journal = {Procedia Engineering },
  volume = {15},
  number = {0},
  pages = {4008 - 4011},
  year = {2011},
  note = {\{CEIS\} 2011 },
  issn = {1877-7058},
  doi = {http://dx.doi.org/10.1016/j.proeng.2011.08.751},
  url = {http://www.sciencedirect.com/science/article/pii/S1877705811022521},
  author = {Wu Xie and Huimin Zhang and Bizhong Wei and Guanghai Fang},
  keywords = {Data mining},
  keywords = {database system},
  keywords = {decision tree},
  keywords = {classification},
  keywords = {graduation},
  keywords = {selection },
  abstract = {In order to improve the quality of graduation projects, a database system holding hundreds of graduation project selection results was built in C# with Visual Studio. The information system was mined with the \{ID3\} algorithm, and a decision tree was obtained for analyzing these graduation project choices. The mining results from software testing demonstrate that the quality of graduation project selection is associated mostly with difficulty, project direction and major direction, which guides students to choose suitable graduation projects in time, greatly improving the efficiency and quality of graduation project selection. }
}
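
Editor's note: the Xie et al. entry above relies on ID3, whose attribute-selection criterion is information gain. As a reminder of that textbook computation (not the paper's C# system), here is a small Python sketch; the graduation-project records and attribute names are invented and merely echo the abstract.

# Entropy and information gain, the attribute-selection criterion of ID3.
import math
from collections import Counter

def entropy(labels):
    n = len(labels)
    return -sum((c / n) * math.log2(c / n) for c in Counter(labels).values())

def information_gain(rows, labels, attr):
    """rows: list of dicts; labels: class per row; attr: attribute name."""
    base = entropy(labels)
    by_value = {}
    for row, y in zip(rows, labels):
        by_value.setdefault(row[attr], []).append(y)
    # expected entropy after splitting on attr, weighted by branch size
    remainder = sum(len(ys) / len(labels) * entropy(ys)
                    for ys in by_value.values())
    return base - remainder

# Hypothetical graduation-project records.
rows = [{"difficulty": "high", "direction": "software"},
        {"difficulty": "low",  "direction": "software"},
        {"difficulty": "high", "direction": "hardware"},
        {"difficulty": "low",  "direction": "hardware"}]
labels = ["good", "good", "poor", "good"]
print(information_gain(rows, labels, "difficulty"))

ID3 greedily splits on the attribute with the highest gain and recurses on each branch.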
@article{Wang2014107,
  title = {Latent feature mining of spatial and marginal characteristics for mammographic mass classification },
  journal = {Neurocomputing },
  volume = {144},
  number = {0},
  pages = {107 - 118},
  year = {2014},
  note = {},
  issn = {0925-2312},
  doi = {http://dx.doi.org/10.1016/j.neucom.2013.11.050},
  url = {http://www.sciencedirect.com/science/article/pii/S092523121400736X},
  author = {Ying Wang and Jie Li and Xinbo Gao},
  keywords = {Mammographic mass classification},
  keywords = {\{LDA\}},
  keywords = {Polarized spatial pyramid},
  keywords = {Statistical marginal characteristic},
  keywords = {Computer-aided diagnosis },
  abstract = {Mass classification is one of the key procedures in a mammography computer-aided diagnosis (CAD) system, which is widely applied to help improve clinical diagnosis performance. In the literature, classical mass classification systems typically employ a large number and variety of features for discriminating masses. This produces high computational complexity, and the incompatibility among various features may also harm classification accuracy. Furthermore, latent characteristics of masses, which are useful for revealing their hidden distribution patterns, are seldom considered in existing schemes. For these reasons, this paper proposes a new mammographic mass classification scheme. Mammograms are first detected and segmented to obtain regions of interest with masses (ROIms). Then Latent Dirichlet Allocation (LDA) is introduced to find the hidden topic distribution of ROIms. A special spatial pyramid structure is proposed and incorporated with \{LDA\} for capturing latent spatial characteristics of ROIms. For mining latent statistical marginal characteristics of masses, local patches on the segmented boundary are extracted to construct a special document for LDA. Finally, all the latent topics are combined, analyzed and classified by employing an \{SVM\} classifier. The experimental results on a dataset in \{DDSM\} demonstrate the effectiveness and efficiency of the proposed classification scheme. }
}
@article{Chandanan2015143,
  title = {Removal of Duplicate Rules for Association Rule Mining from Multilevel Dataset },
  journal = {Procedia Computer Science },
  volume = {45},
  number = {0},
  pages = {143 - 149},
  year = {2015},
  note = {International Conference on Advanced Computing Technologies and Applications (ICACTA) },
  issn = {1877-0509},
  doi = {http://dx.doi.org/10.1016/j.procs.2015.03.106},
  url = {http://www.sciencedirect.com/science/article/pii/S1877050915003427},
  author = {A.K. Chandanan and M.K. Shukla},
  keywords = {Association Rule},
  keywords = {Duplicate Rules},
  keywords = {Multi-level and quantitative data },
  abstract = {Association rules are one of the most researched areas of data mining and are useful in marketing and retailing strategies. Association mining retrieves sets of attributes shared by a large number of objects in a given database. There are many potential application areas for the association rule approach, including design, layout, and customer segmentation. Redundancy in association rules affects the quality of the information presented, and the goal of redundancy elimination is to improve the quality and usefulness of the rules. Our work aims to remove hierarchical duplicates in multi-level rule sets, thus reducing the size of the rule set to improve its quality and usefulness without any loss. }
}
@article{Cohen201167,
  title = {What can instructors and policy makers learn about Web-supported learning through Web-usage mining },
  journal = {The Internet and Higher Education },
  volume = {14},
  number = {2},
  pages = {67 - 76},
  year = {2011},
  note = {Web mining and higher education: Introduction to the special issue },
  issn = {1096-7516},
  doi = {http://dx.doi.org/10.1016/j.iheduc.2010.07.008},
  url = {http://www.sciencedirect.com/science/article/pii/S1096751610000631},
  author = {Anat Cohen and Rafi Nachmias},
  keywords = {Web-supported learning},
  keywords = {Web-usage mining},
  keywords = {Campus-wide},
  keywords = {Computational measures },
  abstract = {This paper focuses on a Web-log based tool for evaluating pedagogical processes occurring in Web-supported academic instruction and students' attitudes. The tool consists of computational measures which demonstrate what instructors and policy makers can learn about Web-supported instruction through Web-usage mining. The tool can provide different measures and reports for instructors at the micro level, and for policy makers at the macro level. The instructors' reports provide feedback relating to the pedagogical processes in their course Websites in comparison to other similar courses on campus. The policy makers' reports provide data about the extent of use of course Websites across the campus, the benefits of such use, and the return on investment. This paper describes the tool and its computational measures as well as its implementation, first on a sample course and next on 3453 course Websites at Tel-Aviv University. }
}
@article{Xu2011743,
  title = {Mining comparative opinions from customer reviews for Competitive Intelligence },
  journal = {Decision Support Systems },
  volume = {50},
  number = {4},
  pages = {743 - 754},
  year = {2011},
  note = {Enterprise Risk and Security Management: Data, Text and Web Mining },
  issn = {0167-9236},
  doi = {http://dx.doi.org/10.1016/j.dss.2010.08.021},
  url = {http://www.sciencedirect.com/science/article/pii/S0167923610001454},
  author = {Kaiquan Xu and Stephen Shaoyi Liao and Jiexun Li and Yuxia Song},
  keywords = {Opinion mining},
  keywords = {Enterprise risk management},
  keywords = {Competitive Intelligence },
  abstract = {Competitive Intelligence is one of the key factors for enterprise risk management and decision support. However, the functions of Competitive Intelligence are often greatly restricted by the lack of sufficient information sources about competitors. With the emergence of Web 2.0, the large numbers of customer-generated product reviews often contain information about competitors and have become a new source for mining Competitive Intelligence. In this study, we propose a novel graphical model to extract and visualize comparative relations between products from customer reviews, with the interdependencies among relations taken into consideration, to help enterprises discover potential risks and further design new products and marketing strategies. Our experiments on a corpus of Amazon customer reviews show that our proposed method can extract comparative relations more accurately than the benchmark methods. Furthermore, this study opens the door to analyzing rich consumer-generated data for enterprise risk management. }
}
@article{Li2015811,
  title = {Supervised topic models for multi-label classification },
  journal = {Neurocomputing },
  volume = {149, Part B},
  number = {0},
  pages = {811 - 819},
  year = {2015},
  note = {},
  issn = {0925-2312},
  doi = {http://dx.doi.org/10.1016/j.neucom.2014.07.053},
  url = {http://www.sciencedirect.com/science/article/pii/S0925231214010054},
  author = {Ximing Li and Jihong Ouyang and Xiaotang Zhou},
  keywords = {Supervised topic model},
  keywords = {Multi-label classification},
  keywords = {Label frequency},
  keywords = {Label dependency },
  abstract = {Recently, several publications have indicated that generative modeling approaches, i.e., topic models, achieve appreciable performance on multi-label classification, especially for skewed data sets. In this paper, we develop two supervised topic models for multi-label classification problems. The two models, Frequency-LDA (FLDA) and Dependency-Frequency-LDA (DFLDA), extend Latent Dirichlet Allocation (LDA) via two observations, i.e., the frequencies of the labels and the dependencies among different labels. We train the models with the Gibbs sampling algorithm. The experimental results on well-known collections demonstrate that our two models outperform the state-of-the-art approaches. }
}
@article{Hsu20113431,
  title = {EduMiner: Using text mining for automatic formative assessment },
  journal = {Expert Systems with Applications },
  volume = {38},
  number = {4},
  pages = {3431 - 3439},
  year = {2011},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2010.08.129},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417410009371},
  author = {Jung-Lung Hsu and Huey-Wen Chou and Hsiu-Hua Chang},
  keywords = {E-learning},
  keywords = {Formative assessment},
  keywords = {Collective cognition},
  keywords = {Text mining },
  abstract = {Formative assessment and summative assessment are two widely accepted approaches to assessment. While summative assessment is a typically formal assessment used at the end of a lesson or course, formative assessment is an ongoing process of monitoring learners’ progress in knowledge construction. Although empirical evidence has acknowledged that formative assessment is indeed superior to summative assessment, current e-learning assessment systems seldom provide plausible solutions for conducting formative assessment. The major bottleneck in putting formative assessment into practice lies in its labor-intensive and time-consuming nature, which makes it hardly a feasible way of achievement evaluation, especially given the usually large number of learners in e-learning environments. In this regard, this study developed EduMiner to relieve the burdens imposed on instructors and learners by capitalizing on a series of text mining techniques. An empirical study was held to examine effectiveness and to explore outcomes of the features that EduMiner supported. In this study 56 participants enrolled in a “Human Resource Management” course were randomly divided into either experimental groups or control groups. Results indicated that the algorithms introduced in this study serve as a feasible approach for conducting formative assessment in e-learning environments. In addition, learners in experimental groups were highly motivated to phrase the contents with a higher-order level of cognition. Therefore timely feedback of visualized representations is beneficial for facilitating online learners to express more in-depth ideas in discourses. }
}
@article{Brescia2010845,
  title = {Mining knowledge in astrophysical massive data sets },
  journal = {Nuclear Instruments and Methods in Physics Research Section A: Accelerators, Spectrometers, Detectors and Associated Equipment },
  volume = {623},
  number = {2},
  pages = {845 - 849},
  year = {2010},
  note = {1st International Conference on Frontiers in Diagnostics Technologies },
  issn = {0168-9002},
  doi = {http://dx.doi.org/10.1016/j.nima.2010.02.002},
  url = {http://www.sciencedirect.com/science/article/pii/S0168900210001804},
  author = {Massimo Brescia and Giuseppe Longo and Fabio Pasian},
  keywords = {Astrophysics},
  keywords = {Astroinformatics},
  keywords = {Data mining},
  keywords = {Virtual observatory},
  keywords = {Distributed computing},
  keywords = {Knowledge discovery},
  keywords = {Machine learning },
  abstract = {Modern scientific data mainly consist of huge data sets gathered by a very large number of techniques and stored in highly diversified and often incompatible data repositories. More generally, in the e-science environment it is considered a critical and urgent requirement to integrate services across distributed, heterogeneous, dynamic “virtual organizations” formed by different resources within a single enterprise. In the last decade, Astronomy has become an immensely data-rich field due to the evolution of detectors (plates to digital to mosaics), telescopes and space instruments. The Virtual Observatory approach consists of the federation under common standards of all astronomical archives available worldwide, as well as data analysis, data mining and data exploration applications. The main drive behind such an effort is that once the infrastructure is complete, it will allow a new type of multi-wavelength, multi-epoch science which can, at present, only barely be imagined. Data mining, or knowledge discovery in databases, while being the main methodology for extracting the scientific information contained in such Massive Data Sets (MDS), poses crucial problems, since it has to cope with the complexity of transparent access to different computing environments, scalability of algorithms, reusability of resources, and so on. In the present paper we summarize the present status of the \{MDS\} in the Virtual Observatory and what is currently done and planned to bring advanced data mining methodologies to bear, in the context of the \{DAME\} (DAta Mining and Exploration) project. }
}
@incollection{McCue201531,
  title = {Chapter 3 - Data Mining and Predictive Analytics },
  editor = {McCue, Colleen },
  booktitle = {Data Mining and Predictive Analysis (Second Edition) },
  publisher = {Butterworth-Heinemann},
  edition = {Second Edition},
  address = {Boston},
  year = {2015},
  pages = {31 - 48},
  isbn = {978-0-12-800229-2},
  doi = {http://dx.doi.org/10.1016/B978-0-12-800229-2.00003-1},
  url = {http://www.sciencedirect.com/science/article/pii/B9780128002292000031},
  author = {Colleen McCue},
  keywords = {Discovery},
  keywords = {prediction},
  keywords = {confirmation},
  keywords = {surprise},
  keywords = {characterization},
  keywords = {exploratory graphics},
  keywords = {data exploration},
  keywords = {link analysis},
  keywords = {nonobvious relationship analysis (NORA) },
  abstract = {Data mining and predictive analytics support the discovery and characterization of trends, patterns, and relationships in data through the use of exploratory graphics in combination with advanced statistical modeling, machine learning, and artificial intelligence. Results can be understood in terms of their contribution to confirmation and discovery. Confirmation involves the validation, extension, and operationalization of what we know, and discovery includes the identification of new trends, patterns, and relationships. }
}
@article{Herawan2011186,
  title = {A soft set approach for association rules mining },
  journal = {Knowledge-Based Systems },
  volume = {24},
  number = {1},
  pages = {186 - 195},
  year = {2011},
  note = {},
  issn = {0950-7051},
  doi = {http://dx.doi.org/10.1016/j.knosys.2010.08.005},
  url = {http://www.sciencedirect.com/science/article/pii/S0950705110001346},
  author = {Tutut Herawan and Mustafa Mat Deris},
  keywords = {Association rules mining},
  keywords = {Maximal association rules mining},
  keywords = {Boolean-valued information systems},
  keywords = {Soft set theory},
  keywords = {Items co-occurrence },
  abstract = {In this paper, we present an alternative approach for mining regular association rules and maximal association rules from transactional datasets using soft set theory. The approach starts by transforming a transactional dataset into a Boolean-valued information system. Since the “standard” soft set deals with such information systems, a transactional dataset can be represented as a soft set. Using the concept of parameter co-occurrence in a transaction, we define the notions of regular and maximal association rules between two sets of parameters, together with their support and confidence (and maximal support and maximal confidence, respectively), using soft set theory. The results show that the soft regular and soft maximal association rules are identical to the rules obtained by regular and maximal association rule mining. }
}
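
Editor's note: the first step in the Herawan and Deris approach above is the transformation of a transactional dataset into a Boolean-valued information system. A minimal Python sketch of that representation, with toy item names, and of itemset support read off from it (illustration only, not the paper's soft set machinery):

# Transform a transactional dataset into a Boolean-valued information
# system (objects x items), the representation used for soft sets above.
def boolean_information_system(transactions):
    items = sorted({i for tx in transactions for i in tx})
    table = [[1 if i in tx else 0 for i in items] for tx in transactions]
    return items, table

# Support of an itemset = fraction of rows with 1s in all its columns.
def support(items, table, itemset):
    cols = [items.index(i) for i in itemset]
    hits = sum(all(row[c] for c in cols) for row in table)
    return hits / len(table)

transactions = [{"bread", "milk"}, {"bread", "butter"}, {"milk"}]
items, table = boolean_information_system(transactions)
print(support(items, table, {"bread", "milk"}))  # 1/3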
@article{Cao20108090,
  title = {Enhancing effectiveness of density-based outlier mining scheme with density-similarity-neighbor-based outlier factor },
  journal = {Expert Systems with Applications },
  volume = {37},
  number = {12},
  pages = {8090 - 8101},
  year = {2010},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2010.05.079},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417410004902},
  author = {Hui Cao and Gangquan Si and Yanbin Zhang and Lixin Jia},
  keywords = {Outlier mining},
  keywords = {k-Density},
  keywords = {\{SDS\}},
  keywords = {\{ASC\}},
  keywords = {\{DSNOF\} },
  abstract = {This paper proposes a density-similarity-neighbor-based outlier mining algorithm for data preprocessing in data mining. First, the concept of the k-density of an object is presented and the similar density series (SDS) of the object is established based on the changes between the k-density of the object and the k-densities of its neighbors. Second, the average series cost (ASC) of the object is obtained as the weighted sum of the distances between adjacent objects in the \{SDS\} of the object. Finally, the density-similarity-neighbor-based outlier factor (DSNOF) of the object is calculated using both the \{ASC\} of the object and the \{ASC\} of the k-distance neighbors of the object, and the degree to which the object is an outlier is indicated by the DSNOF. Experiments are performed on synthetic and real datasets to evaluate the effectiveness and the performance of the proposed algorithm. The experimental results verify that the proposed algorithm achieves higher-quality outlier mining without increasing algorithmic complexity. }
}
@article{Gharib2010800,
  title = {An efficient algorithm for incremental mining of temporal association rules },
  journal = {Data & Knowledge Engineering },
  volume = {69},
  number = {8},
  pages = {800 - 815},
  year = {2010},
  note = {},
  issn = {0169-023X},
  doi = {http://dx.doi.org/10.1016/j.datak.2010.03.002},
  url = {http://www.sciencedirect.com/science/article/pii/S0169023X10000364},
  author = {Tarek F. Gharib and Hamed Nassar and Mohamed Taha and Ajith Abraham},
  keywords = {Temporal Association Rules (TAR)},
  keywords = {Incremental temporal mining},
  keywords = {Updating temporal association rules},
  keywords = {Temporal mining },
  abstract = {This paper presents the concept of temporal association rules in order to solve the problem of handling time series by including time expressions into association rules. Actually, temporal databases are continually appended or updated so that the discovered rules need to be updated. Re-running the temporal mining algorithm every time is ineffective since it neglects the previously discovered rules, and repeats the work done previously. Furthermore, existing incremental mining techniques cannot deal with temporal association rules. In this paper, an incremental algorithm to maintain the temporal association rules in a transaction database is proposed. The algorithm benefits from the results of earlier mining to derive the final mining output. The experimental results on both the synthetic and the real dataset illustrate a significant improvement over the conventional approach of mining the entire updated database. }
}
@article{Moeyersoms201580,
  title = {Comprehensible software fault and effort prediction: A data mining approach },
  journal = {Journal of Systems and Software },
  volume = {100},
  number = {0},
  pages = {80 - 90},
  year = {2015},
  note = {},
  issn = {0164-1212},
  doi = {http://dx.doi.org/10.1016/j.jss.2014.10.032},
  url = {http://www.sciencedirect.com/science/article/pii/S0164121214002295},
  author = {Julie Moeyersoms and Enric Junqué de Fortuny and Karel Dejaeger and Bart Baesens and David Martens},
  keywords = {Rule extraction},
  keywords = {Software fault and effort prediction},
  keywords = {Comprehensibility },
  abstract = {Software fault and effort prediction are important tasks for minimizing the costs of a software project. In software effort prediction the aim is to forecast the effort needed to complete a software project, whereas software fault prediction tries to identify fault-prone modules. In this research both tasks are considered, using different data mining techniques. The predictive models not only need to be accurate but also comprehensible, demanding that the user can understand the motivation behind the model's prediction. Unfortunately, to obtain predictive performance, comprehensibility is often sacrificed and vice versa. To overcome this problem, we extract trees from well-performing Random Forests (RFs) and Support Vector Machines for regression (SVRs) using the rule extraction algorithm ALPA. This method builds trees (using C4.5 and REPTree) that mimic the black-box model (RF, SVR) as closely as possible. The proposed methodology is applied to publicly available datasets, complemented with new datasets that we have put together based on the Android repository. Surprisingly, the trees extracted from the black-box models by \{ALPA\} are not only comprehensible and explain how the black-box model makes (most of) its predictions, but are also more accurate than the trees obtained by working directly on the data. }
}
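
Editor's note: the core setup in the entry above is rule extraction, i.e., fitting an interpretable tree to a black-box model's own predictions so the tree mimics it. The scikit-learn sketch below shows that general surrogate-tree idea on synthetic data; it is not the ALPA algorithm itself, whose sampling strategy is more elaborate.

# Surrogate-tree sketch: train a tree on the black-box model's own
# predictions so the tree mimics it. General idea only, not ALPA.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=500, n_features=10, random_state=0)
black_box = RandomForestClassifier(random_state=0).fit(X, y)
y_bb = black_box.predict(X)                      # black-box labels
surrogate = DecisionTreeClassifier(max_depth=3).fit(X, y_bb)
# "Fidelity": how often the small tree agrees with the black box.
print("fidelity:", (surrogate.predict(X) == y_bb).mean())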
@article{Zafra201193,
  title = {Multiple Instance Learning with Multiple Objective Genetic Programming for Web Mining },
  journal = {Applied Soft Computing },
  volume = {11},
  number = {1},
  pages = {93 - 102},
  year = {2011},
  note = {},
  issn = {1568-4946},
  doi = {http://dx.doi.org/10.1016/j.asoc.2009.10.021},
  url = {http://www.sciencedirect.com/science/article/pii/S1568494609002129},
  author = {Amelia Zafra and Eva L. Gibaja and Sebastián Ventura},
  keywords = {Multi-instance learning},
  keywords = {Multi-objective learning},
  keywords = {Genetic programming},
  keywords = {Web Mining },
  abstract = {This paper introduces a multi-objective grammar based genetic programming algorithm, MOG3P-MI, to solve a Web Mining problem from the perspective of multiple instance learning. This algorithm is evaluated and compared to other algorithms that were previously used to solve this problem. Computational experiments show that the MOG3P-MI algorithm obtains the best results, adds comprehensibility and clarity to the knowledge discovery process and overcomes the main drawbacks of previous techniques obtaining solutions which maintain a balance between conflicting measurements like sensitivity and specificity. }
}
@article{Liang20111388,
  title = {The Chinese Unknown Term Translation Mining with Supervised Candidate Term Extraction Strategy },
  journal = {Procedia Engineering },
  volume = {15},
  number = {0},
  pages = {1388 - 1392},
  year = {2011},
  note = {\{CEIS\} 2011 },
  issn = {1877-7058},
  doi = {http://dx.doi.org/10.1016/j.proeng.2011.08.257},
  url = {http://www.sciencedirect.com/science/article/pii/S1877705811017589},
  author = {Ying-Hong Liang and Jin-xiang Li and Liang Ye and Ke Chen and Cui-zhen Guo},
  keywords = {\{OOV\} term},
  keywords = {Translation mining},
  keywords = {Headword expansion },
  abstract = {Most researchers extract candidate terms using unsupervised methods. In this paper, a supervised candidate term extraction method is proposed that combines English part-of-speech tagging with a headword expansion chunking strategy. First, it retrieves bilingual snippets from the web by term expansion; then it crawls Chinese-English pages and screens out Chinese words from those pages; lastly, a headword expansion chunking strategy is used to identify English phrases, from which the noun phrases and verb phrases are selected. These selected English phrases serve as the final candidate terms for term translation mining. Experimental results show that the supervised candidate term extraction method improves the top-10 inclusion rate by 1.6% over the baseline system, which verifies that the method is effective. }
}
@article{Kumar2010451,
  title = {Allele mining in crops: Prospects and potentials },
  journal = {Biotechnology Advances },
  volume = {28},
  number = {4},
  pages = {451 - 461},
  year = {2010},
  note = {},
  issn = {0734-9750},
  doi = {http://dx.doi.org/10.1016/j.biotechadv.2010.02.007},
  url = {http://www.sciencedirect.com/science/article/pii/S0734975010000273},
  author = {G. Ram Kumar and K. Sakthivel and R.M. Sundaram and C.N. Neeraja and S.M. Balachandran and N. Shobha Rani and B.C. Viraktamath and M.S. Madhav},
  keywords = {Allele mining},
  keywords = {Promoter mining},
  keywords = {Allelic variation},
  keywords = {Crop gene pool},
  keywords = {EcoTilling },
  abstract = {Enormous sequence information is available in public databases as a result of the sequencing of diverse crop genomes. It is important to use this genomic information for the identification and isolation of novel and superior alleles of agronomically important genes from crop gene pools, to be suitably deployed for the development of improved cultivars. Allele mining is a promising approach to dissect naturally occurring allelic variation at candidate genes controlling key agronomic traits, with potential applications in crop improvement programs. It helps in tracing the evolution of alleles, identifying new haplotypes and developing allele-specific markers for use in marker-assisted selection. Realizing this immense potential, concerted allele mining efforts are underway in many international crop research institutes. This review examines the concepts, approaches and applications of allele mining, along with the associated challenges, while emphasizing the need for more refined ‘mining’ strategies to accelerate the process of allele discovery and its utilization in molecular breeding. }
}
@article{Niu20111726,
  title = {Second-order Mining for Active Collaborative Filtering },
  journal = {Procedia Computer Science },
  volume = {4},
  number = {0},
  pages = {1726 - 1734},
  year = {2011},
  note = {Proceedings of the International Conference on Computational Science, \{ICCS\} 2011 },
  issn = {1877-0509},
  doi = {http://dx.doi.org/10.1016/j.procs.2011.04.187},
  url = {http://www.sciencedirect.com/science/article/pii/S1877050911002456},
  author = {Lingfeng Niu and Jianmin Wu and Yong Shi},
  keywords = {Second-order Mining},
  keywords = {Active Learning},
  keywords = {Collaborative Filtering },
  abstract = {Active learning for collaborative filtering tasks has drawn much attention from the research community. It can capture a user's interests with a greatly reduced labeling burden for the online user, so high-quality recommendations can be made with a good user experience. In this paper we address the efficiency challenge that current active learning methods face in online and interactive applications by using second-order mining techniques. Based on a global latent semantic model learnt from the feedback of historical users on items, we propose an intuitive and efficient query strategy for item selection for a new active user. The time complexity of each query is reduced to constant O(1). Experimental results on publicly available data sets show the efficiency and effectiveness of our method. }
}
@article{Tseng20099524,
  title = {Effective temporal data classification by integrating sequential pattern mining and probabilistic induction },
  journal = {Expert Systems with Applications },
  volume = {36},
  number = {5},
  pages = {9524 - 9532},
  year = {2009},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2008.10.077},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417408007884},
  author = {Vincent S. Tseng and Chao-Hui Lee},
  keywords = {Temporal data},
  keywords = {Classification},
  keywords = {Sequential pattern},
  keywords = {Scoring method},
  keywords = {Data mining },
  abstract = {Data classification is an important topic in the field of data mining due to its wide applications. A number of related methods have been proposed based on well-known learning models such as decision trees or neural networks. Although data classification has been widely discussed, relatively few studies have explored the topic of temporal data classification. Most existing research has focused on improving the accuracy of classification by using statistical models, neural networks, or distance-based methods. However, these methods cannot explain the classification results to users. In many research settings, such as gene expression microarrays, users prefer interpretable classification information over a classifier that merely achieves high accuracy. In this paper, we propose a novel pattern-based data mining method, namely classify-by-sequence (CBS), for classifying large temporal datasets. The main methodology behind \{CBS\} is the integration of sequential pattern mining with probabilistic induction. \{CBS\} has the merit of simplicity in implementation, and its pattern-based architecture can supply clear classification information to users. Through experimental evaluation, \{CBS\} was shown to deliver classification results with high accuracy on two real time series datasets. In addition, we designed a simulator to evaluate the performance of \{CBS\} on datasets with different characteristics. The experimental results show that \{CBS\} can discover the hidden patterns and classify data effectively by utilizing the mined sequential patterns. }
}
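
Editor's note: the classify-by-sequence idea summarized above can be pictured as scoring a new temporal sequence against each class's mined sequential patterns. The Python sketch below is only in the spirit of CBS, not the authors' exact scoring function; the patterns, weights and labels are made up.

# Sketch of pattern-based temporal classification in the spirit of CBS:
# score each class by the weights of its sequential patterns that occur
# (as subsequences) in the new sequence.
def contains_subsequence(seq, pattern):
    it = iter(seq)  # each pattern element must appear, in order
    return all(any(x == p for x in it) for p in pattern)

def classify(seq, class_patterns):
    """class_patterns: {label: [(pattern, weight), ...]}"""
    scores = {label: sum(w for p, w in pats if contains_subsequence(seq, p))
              for label, pats in class_patterns.items()}
    return max(scores, key=scores.get), scores

patterns = {"up":   [(("a", "b"), 0.9), (("b", "c"), 0.6)],
            "down": [(("c", "a"), 0.8)]}
print(classify(("a", "x", "b", "c"), patterns))  # predicts 'up'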
@article{Uçar20111404,
  title = {Predicting existence of Mycobacterium tuberculosis on patients using data mining approaches },
  journal = {Procedia Computer Science },
  volume = {3},
  number = {0},
  pages = {1404 - 1411},
  year = {2011},
  note = {World Conference on Information Technology },
  issn = {1877-0509},
  doi = {http://dx.doi.org/10.1016/j.procs.2011.01.022},
  url = {http://www.sciencedirect.com/science/article/pii/S1877050911000238},
  author = {Tamer Uçar and Adem Karahoca},
  keywords = {Tuberculosis},
  keywords = {\{ANFIS\}},
  keywords = {Multilayer perceptron},
  keywords = {\{PART\}},
  keywords = {Data mining },
  abstract = {A correct diagnosis of tuberculosis (TB) can only be confirmed by applying a medical test to a patient’s phlegm, the result of which is obtained after a period of about 45 days. The purpose of this study is to develop a data mining (DM) solution that makes the diagnosis of tuberculosis as accurate as possible and helps in deciding whether it is reasonable to start tuberculosis treatment on suspected patients without waiting for the exact medical test results. In this research, we propose the use of a Sugeno-type “adaptive-network-based fuzzy inference system” (ANFIS) to predict the existence of mycobacterium tuberculosis. 667 different patient records obtained from a clinic are used in this research, each consisting of 30 separate input parameters. The \{ANFIS\} model is generated using 500 of those records. We also implemented a multilayer perceptron and a \{PART\} model using the same data set. The \{ANFIS\} model classifies the instances with an \{RMSE\} of 18%, whereas the multilayer perceptron does the same classification with an \{RMSE\} of 19% and the \{PART\} algorithm with an \{RMSE\} of 20%. \{ANFIS\} is an accurate and reliable method when compared with the multilayer perceptron and \{PART\} algorithms for the classification of tuberculosis patients. This study contributes to screening patients before the medical test results are available. }
}
@article{Secretan2010891,
  title = {APHID: An architecture for private, high-performance integrated data mining },
  journal = {Future Generation Computer Systems },
  volume = {26},
  number = {7},
  pages = {891 - 904},
  year = {2010},
  note = {},
  issn = {0167-739X},
  doi = {http://dx.doi.org/10.1016/j.future.2010.02.017},
  url = {http://www.sciencedirect.com/science/article/pii/S0167739X10000336},
  author = {Jimmy Secretan and Michael Georgiopoulos and Anna Koufakou and Kel Cardona},
  keywords = {Data mining},
  keywords = {Privacy},
  keywords = {Distributed architectures },
  abstract = {While the emerging field of privacy preserving data mining (PPDM) will enable many new data mining applications, it suffers from several practical difficulties. \{PPDM\} algorithms are challenging to develop and computationally intensive to execute. Developers need convenient abstractions to simplify the engineering of \{PPDM\} applications. The individual parties involved in the data mining process need a way to bring high-performance, parallel computers to bear on the computationally intensive parts of the \{PPDM\} tasks. This paper discusses \{APHID\} (Architecture for Private and High-performance Integrated Data mining), a practical architecture and software framework for developing and executing large scale \{PPDM\} applications. At one tier, the system supports simplified use of cluster and grid resources, and at another tier, the system abstracts communication for easy \{PPDM\} algorithm development. This paper offers a detailed analysis of the challenges in developing \{PPDM\} algorithms with existing frameworks, and motivates the design of a new infrastructure based on these challenges. }
}
@article{Anzanello2011139,
  title = {Data mining-based method for identifying discriminant attributes in sensory profiling },
  journal = {Food Quality and Preference },
  volume = {22},
  number = {1},
  pages = {139 - 148},
  year = {2011},
  note = {},
  issn = {0950-3293},
  doi = {http://dx.doi.org/10.1016/j.foodqual.2010.08.010},
  url = {http://www.sciencedirect.com/science/article/pii/S0950329310001552},
  author = {Michel J. Anzanello and Flavio S. Fogliatto and Karina Rossini},
  keywords = {Attribute selection},
  keywords = {Discriminant attributes},
  keywords = {Sample classification},
  keywords = {Data mining tools },
  abstract = {Selection of attributes from a group of candidates to be assessed through sensory analysis is an important issue when planning sensory panels. In attribute selection it is desirable to reduce the list of attributes to be presented to panelists to avoid fatigue, minimize costs and save time. In some applications the goal is to keep attributes that are relevant and non-redundant in the sensory characterization of products. In this paper, however, we are interested in keeping attributes that best discriminate between products. For that we present a data mining-based method for attribute selection in descriptive sensory panels, such as those used in Quantitative Descriptive Analysis. The proposed method is implemented using Principal Component Analysis and the k-Nearest Neighbor classification technique, in conjunction with Pareto Optimal analysis. The objectives are (i) to identify the set of attributes that best discriminate the samples analyzed in the panel, and (ii) to indicate the group of panelists that provide consistent evaluations. The method is illustrated through a case study in which beef cubes in stew, used as a combat ration by the American Army, are characterized in sensory panels using the Spectrum protocol. }
}
@article{Zhou2010650,
  title = {Text mining for traditional Chinese medical knowledge discovery: A survey },
  journal = {Journal of Biomedical Informatics },
  volume = {43},
  number = {4},
  pages = {650 - 660},
  year = {2010},
  note = {},
  issn = {1532-0464},
  doi = {http://dx.doi.org/10.1016/j.jbi.2010.01.002},
  url = {http://www.sciencedirect.com/science/article/pii/S1532046410000031},
  author = {Xuezhong Zhou and Yonghong Peng and Baoyan Liu},
  keywords = {Text mining},
  keywords = {Traditional Chinese medicine},
  keywords = {Review },
  abstract = {Extracting meaningful information and knowledge from free text is the subject of considerable research interest in the machine learning and data mining fields. Text data mining (or text mining) has become one of the most active research sub-fields in data mining. Significant developments in the area of biomedical text mining during the past years have demonstrated its great promise for supporting scientists in developing novel hypotheses and new knowledge from the biomedical literature. Traditional Chinese medicine (TCM) provides a distinct methodology with which to view human life. It is one of the most complete and distinguished traditional medicines, with a history of several thousand years of studying and practicing the diagnosis and treatment of human disease. It has been shown that the \{TCM\} knowledge obtained from clinical practice has become a significant complementary source of information for the modern biomedical sciences. \{TCM\} literature from the historical period and from modern clinical studies has recently been transformed into digital data in the form of relational databases or text documents, which provide an effective platform for information sharing and retrieval. This motivates and facilitates research and development into knowledge discovery approaches and efforts to modernize TCM. In order to contribute to this still-growing field, this paper presents (1) a comparative introduction to \{TCM\} and modern biomedicine, (2) a survey of the related information sources of TCM, (3) a review and discussion of the state of the art and the development of text mining techniques with applications to TCM, and (4) a discussion of the research issues around \{TCM\} text mining and its future directions. }
}
@incollection{Parra201132,
  title = {2 - Robots for non-conventional de-mining processes: From remote control to autonomy },
  editor = {Baudoin, Y.  and Habib, Maki K. },
  booktitle = {Using Robots in Hazardous Environments },
  publisher = {Woodhead Publishing},
  edition = {},
  address = {},
  year = {2011},
  pages = {32 - 62},
  isbn = {978-1-84569-786-0},
  doi = {http://dx.doi.org/10.1533/9780857090201.1.32},
  url = {http://www.sciencedirect.com/science/article/pii/B9781845697860500022},
  author = {C. Parra and C. Otalora and A. Forero and M. Devy},
  keywords = {robotics},
  keywords = {de-mining},
  keywords = {sensor},
  keywords = {data fusion },
  abstract = {This chapter presents the results of the perceptual strategy of a global project being developed in Bogota, Colombia by Pontificia Universidad Javeriana and in Toulouse, France by LAAS/CNRS. This cooperation project, funded by the Colombian-French ECOS-NORD program, includes an investigation of mine sensing technologies, path planning and robotic platforms, among others. The chapter gives the general context and the main challenges of humanitarian demining, presents a method based on the analysis of fused multisensor data to improve landmine detection, together with its implementation on an embedded system, and presents two robotic demining platforms called \{URSULA\} and AMARANTA. These robots have been designed and built as mine hunting platforms to be used in developing countries, so the platforms could be cheap and easy-to-use solutions for humanitarian demining. The robots contain the perceptual capabilities described earlier in the chapter. }
}
@article{Lee2009155,
  title = {Mining temporal interval relational rules from temporal data },
  journal = {Journal of Systems and Software },
  volume = {82},
  number = {1},
  pages = {155 - 167},
  year = {2009},
  note = {Special Issue: Software Performance - Modeling and Analysis },
  issn = {0164-1212},
  doi = {http://dx.doi.org/10.1016/j.jss.2008.07.037},
  url = {http://www.sciencedirect.com/science/article/pii/S0164121208001854},
  author = {Yong Joon Lee and Jun Wook Lee and Duck Jin Chai and Bu Hyun Hwang and Keun Ho Ryu},
  keywords = {Data mining},
  keywords = {Temporal data},
  keywords = {Rule mining},
  keywords = {Interval temporal mining },
  abstract = {Temporal data mining remains an important research topic, since there are application areas that need knowledge from temporal data, such as sequential patterns, similar time sequences, and cyclic and temporal association rules. Although there are many studies on temporal data mining, they do not deal with discovering knowledge from temporal interval data such as patient histories, purchaser histories, and web logs. We propose a new temporal data mining technique that can extract temporal interval relation rules from temporal interval data by using Allen’s theory: a preprocessing algorithm designed for the generalization of temporal interval data, and a temporal relation algorithm for mining temporal relation rules from the generalized temporal interval data. This technique can provide more useful knowledge than conventional data mining techniques. }
}
@article{Reiner2010975,
  title = {New Strategies for Medical Data Mining, Part 1: Dynamic and Performance-Based Reimbursement },
  journal = {Journal of the American College of Radiology },
  volume = {7},
  number = {12},
  pages = {975 - 979},
  year = {2010},
  note = {},
  issn = {1546-1440},
  doi = {http://dx.doi.org/10.1016/j.jacr.2010.06.007},
  url = {http://www.sciencedirect.com/science/article/pii/S1546144010003248},
  author = {Bruce Reiner},
  keywords = {Reimbursement},
  keywords = {medical economics},
  keywords = {data mining},
  keywords = {evidence-based medicine },
  abstract = {The current professional reimbursement model within medicine was created more than 20 years ago in response to physician dissatisfaction and health care inflationary pressures. Despite many resulting improvements, several deficiencies currently exist within the current reimbursement model, related to transparency, accountability, and quality. As the tenets of evidence-based medicine and pay for performance become ingrained within health care delivery, it would be beneficial to modify the existing reimbursement model to reflect these principles. The opportunity to accomplish this goal is advanced through the continued evolution of information systems technologies and data mining. The author discusses the existing deficiencies in medical reimbursement and makes a number of recommendations for improvement. The ultimate goal is to incorporate objective and standardized data into a transparent and readily accessible database, which can be used to enhance performance, education, and informed decision making. }
}
@article{Benny2015364,
  title = {Keyword Based Tweet Extraction and Detection of Related Topics },
  journal = {Procedia Computer Science },
  volume = {46},
  number = {0},
  pages = {364 - 371},
  year = {2015},
  note = {Proceedings of the International Conference on Information and Communication Technologies, \{ICICT\} 2014, 3-5 December 2014 at Bolgatty Palace & Island Resort, Kochi, India },
  issn = {1877-0509},
  doi = {http://dx.doi.org/10.1016/j.procs.2015.02.032},
  url = {http://www.sciencedirect.com/science/article/pii/S1877050915000964},
  author = {Amrutha Benny and Mintu Philip},
  keywords = {Twitter},
  keywords = {Topic extraction},
  keywords = {frequent patterns},
  keywords = {pattern association},
  keywords = {clustering },
  abstract = {Twitter is a micro-blogging site that enables the transfer of information as short tweets. The large quantity of information makes it necessary to find methods and tools to summarize it. Our research proposes a method that collects tweets using a specific keyword and then summarizes them to find topics related to that keyword. Topic detection is done using clusters of frequent patterns. Existing pattern-oriented topic detection techniques suffer from the problem of wrongly correlated patterns. In this paper, we propose two algorithms, \{TDA\} (Topic Detection using AGF) and \{TCTR\} (Topic Clustering and Tweet Retrieval), which help to overcome this problem. Various experimental results show that the proposed method maintains good performance irrespective of the size of the data set. }
}
@article{Chen20101208,
  title = {An integration of WordNet and fuzzy association rule mining for multi-label document clustering },
  journal = {Data & Knowledge Engineering },
  volume = {69},
  number = {11},
  pages = {1208 - 1226},
  year = {2010},
  note = {Special issue on contribution of ontologies in designing advanced information systems },
  issn = {0169-023X},
  doi = {http://dx.doi.org/10.1016/j.datak.2010.08.003},
  url = {http://www.sciencedirect.com/science/article/pii/S0169023X10000972},
  author = {Chun-Ling Chen and Frank S.C. Tseng and Tyne Liang},
  keywords = {Fuzzy association rule mining},
  keywords = {Text mining},
  keywords = {Document clustering},
  keywords = {WordNet},
  keywords = {Frequent itemsets },
  abstract = {With the rapid growth of text documents, document clustering has become one of the main techniques for organizing large amounts of documents into a small number of meaningful clusters. However, several challenges still exist for document clustering, such as high dimensionality, scalability, accuracy, meaningful cluster labels, overlapping clusters, and extracting semantics from texts. In order to improve the quality of document clustering results, we propose an effective Fuzzy-based Multi-label Document Clustering (FMDC) approach that integrates fuzzy association rule mining with the existing ontology WordNet to alleviate these problems. In our approach, the key terms are extracted from the document set, and the initial representation of all documents is further enriched with WordNet hypernyms in order to exploit the semantic relations between terms. Then, a fuzzy association rule mining algorithm for texts is employed to discover a set of highly related fuzzy frequent itemsets, which contain key terms to be regarded as the labels of the candidate clusters. Finally, each document is dispatched into more than one target cluster by referring to these candidate clusters, and the highly similar target clusters are merged. We conducted experiments to evaluate the performance on the Classic, Re0, R8, and WebKB datasets. The experimental results show that our approach outperforms influential document clustering methods in accuracy. Therefore, our approach not only provides more general and meaningful labels for documents, but also effectively generates overlapping clusters. }
}
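
Editor's note: the enrichment step described in the FMDC entry above extends a document's term set with hypernyms so that semantically related documents overlap. The Python sketch below uses a toy hypernym table as a stand-in for real WordNet lookups:

# Enrich document term sets with hypernyms before clustering, as in the
# FMDC pipeline above; HYPERNYMS is a toy stand-in for WordNet.
HYPERNYMS = {"dog": ["canine", "animal"], "cat": ["feline", "animal"]}

def enrich(terms):
    enriched = set(terms)
    for t in terms:
        enriched.update(HYPERNYMS.get(t, []))
    return enriched

print(enrich({"dog"}))  # {'dog', 'canine', 'animal'}
print(enrich({"cat"}))  # {'cat', 'feline', 'animal'}
# The shared hypernym 'animal' makes the two documents' representations
# overlap, so a clustering step can group them together.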
@article{Kuo2015958,
  title = {The integration of association rule mining and artificial immune network for supplier selection and order quantity allocation },
  journal = {Applied Mathematics and Computation },
  volume = {250},
  number = {0},
  pages = {958 - 972},
  year = {2015},
  note = {},
  issn = {0096-3003},
  doi = {http://dx.doi.org/10.1016/j.amc.2014.11.015},
  url = {http://www.sciencedirect.com/science/article/pii/S0096300314015215},
  author = {R.J. Kuo and C.M. Pai and R.H. Lin and H.C. Chu},
  keywords = {Supplier selection},
  keywords = {Order quantity allocation},
  keywords = {TD-FP-growth algorithm},
  keywords = {Optimization artificial immune network},
  keywords = {Particle swarm optimization },
  abstract = {This study first uses an association rule mining technique, the TD-FP-growth algorithm, to select the important suppliers from the existing suppliers and determine the importance of each supplier. A hybrid of an optimization artificial immune network (Opt-aiNet) and particle swarm optimization (PSO), called aiNet-PSO, is then proposed to allocate the order quantity for the key suppliers at minimum cost. In order to verify the proposed method, a case company’s daily purchasing ledger is used, with emphasis on consumer electronics manufacturers. The computational results indicate that the TD-FP-growth algorithm can select the key suppliers from the historical data. The proposed hybrid method also provides a cheaper solution than a genetic algorithm, particle swarm optimization, or an artificial immune system. }
}
@article{Suh20107255,
  title = {Applying text and data mining techniques to forecasting the trend of petitions filed to e-People },
  journal = {Expert Systems with Applications },
  volume = {37},
  number = {10},
  pages = {7255 - 7268},
  year = {2010},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2010.04.002},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417410002733},
  author = {Jong Hwan Suh and Chung Hoon Park and Si Hyun Jeon},
  keywords = {Text mining},
  keywords = {Data mining},
  keywords = {Petition},
  keywords = {Keyword extracting},
  keywords = {Document clustering},
  keywords = {Forecasting},
  keywords = {e-Government},
  keywords = {Open Innovation},
  keywords = {e-People },
  abstract = {As the Internet has become the virtual place where citizens unite and their opinions are promptly turned into action, two-way communication between the government sector and citizens has become more important among the activities of e-Government. Hence, the Anti-corruption and Civil Rights Commission (ACRC) in the Republic of Korea has constructed the online petition portal system named e-People. In addition, the nation’s Open Innovation through e-People has gained increasing attention, because e-People can serve as the virtual space where citizens participate in improving national law and policy simply by filing petitions as the voice of the nation. However, there are currently problems and challenging issues to be solved before e-People can function as the virtual space for the nation’s Open Innovation based on petitions collected from citizens. First, there is no objective and systematic method for analyzing a large number of petitions filed to e-People without a great deal of manual work by petition inspectors. Second, e-People is required to forecast the trend of petitions filed to e-People more accurately and quickly than petition inspectors can, in order to make better decisions on national law and policy strategy. Therefore, in this paper, we propose a framework that applies text and data mining techniques not only to analyze a large number of petitions filed to e-People but also to predict the trend of petitions. In detail, we apply text mining techniques to the unstructured data of petitions to elicit keywords and to identify groups of petitions with the elicited keywords. Moreover, we apply data mining techniques to the structured data of the identified petition groups in order to forecast the trend of petitions. Our approach decreases the time-consuming manual work of reading and classifying a large number of petitions, and contributes to increasing accuracy in evaluating the trend of petitions. Eventually, it helps petition inspectors to pay more attention to detecting and tracking important groups of petitions that may grow into nationwide problems. Further, the petitions ordered by their petition groups’ trend values can be used as the baseline for making better decisions on national law and policy strategy. }
}
@article{Villalba20151022,
  title = {servIoTicy and iServe: A Scalable Platform for Mining the IoT },
  journal = {Procedia Computer Science },
  volume = {52},
  number = {0},
  pages = {1022 - 1027},
  year = {2015},
  note = {The 6th International Conference on Ambient Systems, Networks and Technologies (ANT-2015), the 5th International Conference on Sustainable Energy Information Technology (SEIT-2015) },
  issn = {1877-0509},
  doi = {http://dx.doi.org/10.1016/j.procs.2015.05.097},
  url = {http://www.sciencedirect.com/science/article/pii/S1877050915008972},
  author = {Álvaro Villalba and Juan Luis Pérez and David Carrera and Carlos Pedrinaci and Luca Panziera},
  keywords = {Internet of Things},
  keywords = {IoT},
  keywords = {Big Data},
  keywords = {Analytics},
  keywords = {Stream Processing},
  keywords = {Semantic Search},
  keywords = {Service Discovery },
  abstract = {In recent years, Internet of Things (IoT) and Big Data platforms have clearly been converging in terms of technologies, problems and approaches. IoT ecosystems generate a vast amount of data that needs to be stored and processed, becoming a Big Data problem. In this paper we present a platform that is specifically designed for mining the information associated with the IoT, including both sensor data and meta-data. The platform is composed of two major components: servIoTicy for storing and processing data, and iServe for the publication and discovery of sensor meta-data. The former provides capabilities to ingest, transform in real time and query data generated by sensors; the latter provides capabilities to publish, discover and use sensors based on the semantic information associated with them. Both components are clearly designed for scalability, as any IoT cloud deployment requires, and both servIoTicy and iServe are available as open source projects. }
}
@article{Li20105895,
  title = {Predicting business failure using classification and regression tree: An empirical comparison with popular classical statistical methods and top classification mining methods },
  journal = {Expert Systems with Applications },
  volume = {37},
  number = {8},
  pages = {5895 - 5904},
  year = {2010},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2010.02.016},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417410000552},
  author = {Hui Li and Jie Sun and Jian Wu},
  keywords = {Business failure prediction (BFP)},
  keywords = {Data mining},
  keywords = {Classification and regression tree (CART) },
  abstract = {Predicting business failure is a very critical task for government officials, stockholders, managers, employees, investors and researchers, especially in today’s competitive economic environment. Several of the top 10 data mining methods have become very popular alternatives in business failure prediction (BFP), e.g., support vector machines and k nearest neighbor. In comparison with other classification mining methods, the advantages of classification and regression tree (CART) methods include simplicity of results, easy implementation, nonlinear estimation, being non-parametric, accuracy and stability. However, few studies in the area of \{BFP\} have examined the applicability of CART, another method among the top 10 algorithms in data mining. The aim of this research is to explore the performance of \{BFP\} using the commonly discussed data mining technique of CART. To demonstrate the effectiveness of \{BFP\} using CART, business failure prediction tasks were performed on a data set collected from companies listed on the Shanghai Stock Exchange and Shenzhen Stock Exchange. A hold-out method repeated thirty times was employed for assessment, and two commonly used methods among the top 10 data mining algorithms, i.e., support vector machines and k nearest neighbor, and two baseline benchmark methods from the statistics area, i.e., multiple discriminant analysis (MDA) and logistic regression, were employed as comparative methods. For the comparative methods, the stepwise method of \{MDA\} was employed to select the optimal feature subset. Empirical results indicated that the optimal \{CART\} algorithm outperforms all the comparative methods in terms of predictive performance and significance tests in short-term \{BFP\} of Chinese listed companies. }
}
@article{Vossen2012288,
  title = {The Process Mining Manifesto—An interview with Wil van der Aalst },
  journal = {Information Systems },
  volume = {37},
  number = {3},
  pages = {288 - 290},
  year = {2012},
  note = {},
  issn = {0306-4379},
  doi = {http://dx.doi.org/10.1016/j.is.2011.10.006},
  url = {http://www.sciencedirect.com/science/article/pii/S0306437911001463},
  author = {Gottfried Vossen},
  abstract = {The IEEE Task Force on Process Mining has recently published its Process Mining Manifesto (PMM) in an effort to promote the topic of process mining. As this topic touches a number of areas in computer science, the editors of Information Systems have decided to conduct an interview with the person in charge of the task force, Prof. Wil van der Aalst of Eindhoven University of Technology (TU/e) in the Netherlands. }
}
@article{Matatov20102696,
  title = {Privacy-preserving data mining: A feature set partitioning approach },
  journal = {Information Sciences },
  volume = {180},
  number = {14},
  pages = {2696 - 2720},
  year = {2010},
  note = {Including Special Section on Hybrid Intelligent Algorithms and Applications },
  issn = {0020-0255},
  doi = {http://dx.doi.org/10.1016/j.ins.2010.03.011},
  url = {http://www.sciencedirect.com/science/article/pii/S0020025510001234},
  author = {Nissim Matatov and Lior Rokach and Oded Maimon},
  keywords = {Data mining},
  keywords = {Privacy},
  keywords = {Genetic algorithms},
  keywords = {k-Anonymity},
  keywords = {Feature set partitioning },
  abstract = {In privacy-preserving data mining (PPDM), a widely used method for achieving data mining goals while preserving privacy is based on k-anonymity. This method, which protects subject-specific sensitive data by anonymizing it before it is released for data mining, demands that every tuple in the released table be indistinguishable from no fewer than k subjects. The most common approach for achieving compliance with k-anonymity is to replace certain values with less specific but semantically consistent values. In this paper we propose a different approach for achieving k-anonymity by partitioning the original dataset into several projections such that each one of them adheres to k-anonymity. Moreover, any attempt to rejoin the projections results in a table that still complies with k-anonymity. A classifier is trained on each projection and subsequently, an unlabelled instance is classified by combining the classifications of all classifiers. Guided by classification accuracy and k-anonymity constraints, the proposed data mining privacy by decomposition (DMPD) algorithm uses a genetic algorithm to search for an optimal feature set partitioning. Ten separate datasets were evaluated with DMPD in order to compare its classification performance with other k-anonymity-based methods. The results suggest that DMPD performs better than existing k-anonymity-based algorithms without requiring domain-dependent knowledge. Using multiobjective optimization methods, we also examine the tradeoff between the two conflicting objectives in PPDM: privacy and predictive performance. }
}
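
The decomposition idea above hinges on a simple feasibility test: a projection complies with k-anonymity when every combination of quasi-identifier values occurs at least k times. A minimal pandas sketch of that test, with column names that are purely illustrative:

    # Hedged sketch: check whether a projection satisfies k-anonymity, i.e.,
    # every quasi-identifier value combination occurs at least k times.
    import pandas as pd

    def is_k_anonymous(df: pd.DataFrame, quasi_identifiers: list, k: int) -> bool:
        group_sizes = df.groupby(quasi_identifiers).size()
        return bool((group_sizes >= k).all())

    # Usage with hypothetical columns: a genetic search such as DMPD would keep
    # only partitionings in which every projection passes this test.
    # ok = is_k_anonymous(data, ["age_band", "zip_prefix"], k=5)
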
@article{Alkhattabi2011862,
  title = {Assessing information quality of e-learning systems: a web mining approach },
  journal = {Computers in Human Behavior },
  volume = {27},
  number = {2},
  pages = {862 - 873},
  year = {2011},
  note = {Web 2.0 in Travel and Tourism: Empowering and Changing the Role of Travelers },
  issn = {0747-5632},
  doi = {http://dx.doi.org/10.1016/j.chb.2010.11.011},
  url = {http://www.sciencedirect.com/science/article/pii/S0747563210003547},
  author = {Mona Alkhattabi and Daniel Neagu and Andrea Cullen},
  keywords = {E-learning},
  keywords = {Information quality},
  keywords = {Information system},
  keywords = {Quality framework},
  keywords = {Quality metrics},
  keywords = {Web mining },
  abstract = {E-learning systems provide a promising solution as an information exchange channel. Improved technologies could mean faster and easier access to information but do not necessarily ensure the quality of this information; for this reason it is essential to develop valid and reliable methods of quality measurement and carry out careful information quality evaluations. This paper proposes an assessment model for information quality in e-learning systems based on the quality framework we proposed previously: the framework consists of 14 quality dimensions grouped into three quality factors: intrinsic, contextual representation and accessibility. We use relative importance as a parameter in a linear equation for the measurement scheme. Previously, we implemented a goal-question-metrics approach to develop a set of quality metrics for the identified quality attributes within the proposed framework. In this paper, the proposed metrics are computed to produce a numerical rating indicating the overall quality of the information published in a particular e-learning system. The data collection and evaluation processes were automated using a web data extraction technique and results on a case study are discussed. This assessment model could be useful to e-learning system designers, providers and users as it provides a comprehensive indication of the quality of information in such systems. }
}
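
The measurement scheme mentioned above, relative importance as a parameter in a linear equation, reduces to a weighted sum of per-dimension scores. A minimal sketch; the dimension names and weights below are placeholders, not the paper's calibrated values:

    # Hedged sketch: overall information quality as a weighted linear combination
    # of quality-dimension scores, normalized by total weight. Values illustrative.
    def information_quality(scores: dict, weights: dict) -> float:
        total = sum(weights[d] for d in scores)
        return sum(weights[d] * scores[d] for d in scores) / total

    iq = information_quality(
        scores={"accuracy": 0.8, "timeliness": 0.6, "accessibility": 0.9},
        weights={"accuracy": 0.5, "timeliness": 0.3, "accessibility": 0.2})
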
@article{Trabelsi2015,
  title = {Extraction and clustering of arguing expressions in contentious text },
  journal = {Data & Knowledge Engineering },
  volume = {nil},
  number = {0},
  pages = { - },
  year = {2015},
  note = {},
  issn = {0169-023X},
  doi = {http://dx.doi.org/10.1016/j.datak.2015.05.004},
  url = {http://www.sciencedirect.com/science/article/pii/S0169023X15000324},
  author = {Amine Trabelsi and Osmar R. Zaïane},
  keywords = {Contention analysis},
  keywords = {Topic models},
  keywords = {Arguing expression detection},
  keywords = {Opinion mining},
  keywords = {Unsupervised clustering},
  keywords = {Online debates },
  abstract = {This work proposes an unsupervised method intended to enhance the quality of opinion mining in contentious text. It presents a Joint Topic Viewpoint (JTV) probabilistic model to analyze the underlying divergent arguing expressions that may be present in a collection of contentious documents. The conceived JTV has the potential to automatically carry out the task of extracting associated terms denoting an arguing expression, according to the hidden topics it discusses and the embedded viewpoint it voices. Furthermore, JTV's structure enables the unsupervised grouping of the obtained arguing expressions according to their viewpoints, using a proposed constrained clustering algorithm which is an adapted version of constrained k-means clustering (COP-KMEANS). Experiments are conducted on three types of contentious documents (polls, online debates and editorials), through six different contentious data sets. Quantitative evaluations of the topic modeling output, as well as the constrained clustering results, show the effectiveness of the proposed method in fitting the data and generating distinctive patterns of arguing expressions. Moreover, it empirically demonstrates better clustering of arguing expressions over state-of-the-art and baseline methods. The qualitative analysis highlights the coherence of clustered arguing expressions of the same viewpoint and the divergence of opposing ones. }
}
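
The constrained clustering step above adapts COP-KMEANS, whose core rule is: assign each point to the nearest centroid whose choice violates none of the point's must-link or cannot-link constraints. A compact sketch of that assignment rule (the full algorithm alternates it with centroid updates); all names here are illustrative:

    # Hedged sketch of the COP-KMEANS assignment rule: nearest feasible centroid.
    import numpy as np

    def violates(i, cluster, assign, must_link, cannot_link):
        for a, b in must_link:                     # linked mate already elsewhere?
            j = b if a == i else a if b == i else None
            if j is not None and assign.get(j, cluster) != cluster:
                return True
        for a, b in cannot_link:                   # forbidden mate already here?
            j = b if a == i else a if b == i else None
            if j is not None and assign.get(j) == cluster:
                return True
        return False

    def constrained_assign(X, centroids, must_link, cannot_link):
        assign = {}
        for i, x in enumerate(X):
            for c in np.argsort(((centroids - x) ** 2).sum(axis=1)):
                if not violates(i, int(c), assign, must_link, cannot_link):
                    assign[i] = int(c)
                    break
            else:
                raise ValueError(f"no feasible cluster for point {i}")
        return assign
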
@article{Debreceny2010157,
  title = {Data mining journal entries for fraud detection: An exploratory study },
  journal = {International Journal of Accounting Information Systems },
  volume = {11},
  number = {3},
  pages = {157 - 181},
  year = {2010},
  note = {2009 Research Symposium on Information Integrity & Information Systems Assurance },
  issn = {1467-0895},
  doi = {http://dx.doi.org/10.1016/j.accinf.2010.08.001},
  url = {http://www.sciencedirect.com/science/article/pii/S1467089510000540},
  author = {Roger S. Debreceny and Glen L. Gray},
  keywords = {Fraud},
  keywords = {Journal entries},
  keywords = {Data mining},
  keywords = {Auditing},
  keywords = {Accounting information systems},
  keywords = {XBRL GL },
  abstract = {Fraud detection has become a critical component of financial audits and audit standards have heightened emphasis on journal entries as part of fraud detection. This paper canvasses perspectives on applying data mining techniques to journal entries. In the past, the impediment to researching journal entry data mining has been access to journal entry data sets, which may explain why the published research in this area is a null set. For this project, we had access to journal entry data sets for 29 different organizations. Our initial exploratory tests of the data sets had interesting preliminary findings. (1) For all 29 entities, the distribution of first digits of journal dollar amounts differed from that expected by Benford's Law. (2) Regarding last digits, unlike first digits, which are expected to have a logarithmic distribution, the last digits would be expected to have a uniform distribution. Our test found that the distribution was not uniform for many of the entities. In fact, eight entities had one number whose frequency was three times more than expected. (3) We compared the number of accounts related to the top five most frequently occurring three-last-digit combinations. Four entities had a very high occurrence of the most frequent three-digit combinations involving only a small set of accounts, one entity had a low occurrence of the most frequent three-digit combination involving a large set of accounts, and 24 had a low occurrence of the most frequent three-digit combinations involving a small set of accounts. In general, the first four entities would probably pose the highest risk of fraud because it could indicate that the fraudster is covering up or falsifying a particular class of transactions. In the future, we will apply more data mining techniques to discover other patterns and relationships in the data sets. We also want to seed the dataset with fraud indicators (e.g., pairs of accounts that would not be expected in a journal entry) and compare the sensitivity of the different data mining techniques in finding these seeded indicators. }
}
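
Finding (1) above rests on Benford's Law, under which the first digit d of naturally occurring amounts appears with probability log10(1 + 1/d). A short sketch of the corresponding goodness-of-fit test with SciPy; the journal-entry amounts are an assumed input:

    # Hedged sketch: compare first digits of journal-entry amounts against
    # Benford's Law, P(d) = log10(1 + 1/d) for d = 1..9, via a chi-square test.
    import numpy as np
    from scipy.stats import chisquare

    def first_digit(x):
        return int(f"{abs(x):.6e}"[0])   # scientific notation: first char = digit

    def benford_test(amounts):
        digits = np.array([first_digit(a) for a in amounts if a != 0])
        observed = np.bincount(digits, minlength=10)[1:10]
        expected = np.log10(1 + 1 / np.arange(1, 10)) * digits.size
        return chisquare(observed, expected)   # low p-value = deviation
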
@article{Lin20104560,
  title = {Linguistic data mining with fuzzy FP-trees },
  journal = {Expert Systems with Applications },
  volume = {37},
  number = {6},
  pages = {4560 - 4567},
  year = {2010},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2009.12.052},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417409010987},
  author = {Chun-Wei Lin and Tzung-Pei Hong and Wen-Hsiang Lu},
  keywords = {Fuzzy data mining},
  keywords = {Fuzzy-set},
  keywords = {Quantitative value},
  keywords = {Fuzzy FP-trees},
  keywords = {Frequent fuzzy patterns },
  abstract = {Due to the increasing occurrence of very large databases, mining useful information and knowledge from transactions is evolving into an important research area. In the past, many algorithms were proposed for mining association rules, most of which were based on items with binary values. Transactions with quantitative values are, however, commonly seen in real-world applications. In this paper, the frequent fuzzy pattern tree (fuzzy FP-tree) is proposed for extracting frequent fuzzy itemsets from the transactions with quantitative values. When extending the FP-tree to handle fuzzy data, the processing becomes much more complex than the original since fuzzy intersection in each transaction has to be handled. The fuzzy FP-tree construction algorithm is thus designed, and the mining process based on the tree is presented. Experimental results on three different numbers of fuzzy regions also show the performance of the proposed approach. }
}
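
Before a fuzzy FP-tree can be built, each quantitative value must be converted into membership degrees over linguistic regions. A minimal sketch of that fuzzification step with triangular membership functions; the three regions and their breakpoints are illustrative, not the paper's:

    # Hedged sketch: fuzzify a quantitative item value into linguistic regions
    # (low / middle / high) using triangular membership functions.
    def triangular(x, a, b, c):
        if x <= a or x >= c:
            return 0.0
        return (x - a) / (b - a) if x <= b else (c - x) / (c - b)

    def fuzzify(q, lo=0.0, mid=5.0, hi=10.0):
        return {
            "low":    (mid - q) / (mid - lo) if lo <= q < mid else 0.0,
            "middle": triangular(q, lo, mid, hi),
            "high":   (q - mid) / (hi - mid) if mid < q <= hi else 0.0,
        }

    # fuzzify(3) -> {'low': 0.4, 'middle': 0.6, 'high': 0.0}
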
@article{Subramaniyaswamy2015447,
  title = {Intelligent Travel Recommendation System by Mining Attributes from Community Contributed Photos },
  journal = {Procedia Computer Science },
  volume = {50},
  number = {0},
  pages = {447 - 455},
  year = {2015},
  note = {Big Data, Cloud and Computing Challenges },
  issn = {1877-0509},
  doi = {http://dx.doi.org/10.1016/j.procs.2015.04.014},
  url = {http://www.sciencedirect.com/science/article/pii/S1877050915005153},
  author = {V. Subramaniyaswamy and V. Vijayakumar and R. Logesh and V. Indragandhi},
  keywords = {Personalized Travel Recommendation},
  keywords = {geo-tagged photos},
  keywords = {route planning},
  keywords = {Adaboost},
  keywords = {Naïve Bayesian Modeling },
  abstract = {This paper proposes a system which helps users find tourist locations they might like to visit, based on user-contributed photos of places available on photo-sharing websites. The paper describes methods used to mine demographic information and provide travel recommendations to users. It also describes the AdaBoost algorithm used to classify data and a Bayesian learning model for predicting a desired location for a user based on his/her preferences. }
}
@article{Bing20151,
  title = {Adaptive Concept Resolution for document representation and its applications in text mining },
  journal = {Knowledge-Based Systems },
  volume = {74},
  number = {0},
  pages = {1 - 13},
  year = {2015},
  note = {},
  issn = {0950-7051},
  doi = {http://dx.doi.org/10.1016/j.knosys.2014.10.003},
  url = {http://www.sciencedirect.com/science/article/pii/S0950705114003700},
  author = {Lidong Bing and Shan Jiang and Wai Lam and Yan Zhang and Shoaib Jameel},
  keywords = {Adaptive Concept Resolution},
  keywords = {Ontology},
  keywords = {WordNet},
  keywords = {Wikipedia},
  keywords = {WordNet-Plus },
  abstract = {It is well known that synonymous and polysemous terms often introduce noise when we calculate the similarity between documents. Existing ontology-based document representation methods are static, so the semantic concepts selected for representing a document have a fixed resolution. Therefore, they are not adaptable to the characteristics of the document collection and the text mining problem at hand. We propose an Adaptive Concept Resolution (ACR) model to overcome this problem. ACR can learn a concept border from an ontology, taking into consideration the characteristics of the particular document collection. Then, this border provides a tailor-made semantic concept representation for a document coming from the same domain. Another advantage of ACR is that it is applicable in both the classification task, where the groups are given in the training document set, and the clustering task, where no group information is available. The experimental results show that ACR outperforms an existing static method in almost all cases. We also present a method to integrate Wikipedia entities into an expert-edited ontology, namely WordNet, to generate an enhanced ontology named WordNet-Plus, and its performance is also examined under the ACR model. Due to its high coverage, WordNet-Plus can outperform WordNet in classification on data sets containing fresher documents. }
}
@article{Kim201587,
  title = {Is Korean -(n)un a topic marker? On the nature of -(n)un and its relation to information structure },
  journal = {Lingua },
  volume = {154},
  number = {0},
  pages = {87 - 109},
  year = {2015},
  note = {},
  issn = {0024-3841},
  doi = {http://dx.doi.org/10.1016/j.lingua.2014.11.010},
  url = {http://www.sciencedirect.com/science/article/pii/S0024384114002769},
  author = {Ilkyu Kim},
  keywords = {Information structure},
  keywords = {Topic},
  keywords = {Contrast},
  keywords = {Discourse salience},
  keywords = {Korean -(n)un },
  abstract = {Basic categories of information structure (e.g. topic, focus, contrast) are known to be crosslinguistically expressed by various linguistic devices such as special intonation contours, syntactic mechanisms, and morphological markers. However, the nature of the relation between the categories and their linguistic “markers” has rarely been discussed. To be more specific, despite the rich literature on information structure, whether the categories are directly or indirectly related to their markers has not been of much interest to linguists until quite recently. The main purpose of this paper is to unveil the nature of the relation between Korean -(n)un and the information-structural notions related to it, namely, topic and contrast. Based on a corpus study, it will be claimed that -(n)un is not a topic/contrast marker per se; rather, its function is to impose salience on a discourse referent. Topicality and contrast, widely assumed to be directly marked by -(n)un, will be claimed to be derived only from the interaction of the proposed meaning of -(n)un and various syntactic, semantic, and pragmatic factors. Consequently, this paper provides strong support for recent attempts to show that the information-structural categories are merely pragmatic effects rather than stable and discrete universal primitives. }
}
@article{Hájek201034,
  title = {The \{GUHA\} method and its meaning for data mining },
  journal = {Journal of Computer and System Sciences },
  volume = {76},
  number = {1},
  pages = {34 - 48},
  year = {2010},
  note = {Special Issue on Intelligent Data Analysis },
  issn = {0022-0000},
  doi = {http://dx.doi.org/10.1016/j.jcss.2009.05.004},
  url = {http://www.sciencedirect.com/science/article/pii/S0022000009000439},
  author = {Petr Hájek and Martin Holeňa and Jan Rauch},
  keywords = {GUHA method},
  keywords = {Data mining},
  keywords = {LISp-Miner},
  keywords = {Fuzzy hypotheses },
  abstract = {The paper presents the history and present state of the GUHA method, its theoretical foundations, and its relation to and meaning for data mining. }
}
@article{Jans201017,
  title = {Internal fraud risk reduction: Results of a data mining case study },
  journal = {International Journal of Accounting Information Systems },
  volume = {11},
  number = {1},
  pages = {17 - 41},
  year = {2010},
  note = {},
  issn = {1467-0895},
  doi = {http://dx.doi.org/10.1016/j.accinf.2009.12.004},
  url = {http://www.sciencedirect.com/science/article/pii/S1467089509000645},
  author = {Mieke Jans and Nadine Lybaert and Koen Vanhoof},
  keywords = {Internal fraud},
  keywords = {Data mining},
  keywords = {Risk reduction},
  keywords = {Latent class clustering },
  abstract = {Corporate fraud represents a huge cost to the current economy. Academic literature has demonstrated how data mining techniques can be of value in the fight against fraud. This research has focused on fraud detection, mostly in a context of external fraud. In this paper, we discuss the use of a data mining approach to reduce the risk of internal fraud. Reducing fraud risk involves both detection and prevention. Accordingly, a descriptive data mining strategy is applied, as opposed to the prediction data mining techniques widely used in the literature. The results of applying a multivariate latent class clustering algorithm to a case company's procurement data suggest that using this technique in a descriptive data mining approach is useful in assessing the current risk of internal fraud. The same results could not be obtained by applying a univariate analysis. }
}
@article{Li2010354,
  title = {Using text mining and sentiment analysis for online forums hotspot detection and forecast },
  journal = {Decision Support Systems },
  volume = {48},
  number = {2},
  pages = {354 - 368},
  year = {2010},
  note = {},
  issn = {0167-9236},
  doi = {http://dx.doi.org/10.1016/j.dss.2009.09.003},
  url = {http://www.sciencedirect.com/science/article/pii/S0167923609002097},
  author = {Nan Li and Desheng Dash Wu},
  keywords = {Text mining},
  keywords = {Sentiment analysis},
  keywords = {Cluster analysis},
  keywords = {Online sports forums},
  keywords = {Dynamic interacting network analysis},
  keywords = {Hotspot detection},
  keywords = {Machine learning},
  keywords = {Support vector machine },
  abstract = {Text sentiment analysis, also referred to as emotional polarity computation, has become a flourishing frontier in the text mining community. This paper studies online forum hotspot detection and forecasting using sentiment analysis and text mining approaches. First, we create an algorithm to automatically analyze the emotional polarity of a text and to obtain a value for each piece of text. Second, this algorithm is combined with K-means clustering and support vector machine (SVM) to develop an unsupervised text mining approach. We use the proposed text mining approach to group the forums into various clusters, with the center of each representing a hotspot forum within the current time span. The data sets used in our empirical studies are acquired and formatted from Sina sports forums, which span 31 different topic forums and 220,053 posts. Experimental results demonstrate that SVM forecasting achieves highly consistent results with K-means clustering. The top 10 hotspot forums listed by SVM forecasting match 80% of the K-means clustering results. Both SVM and K-means achieve the same results for the top 4 hotspot forums of the year. }
}
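
A compact sketch of the unsupervised pipeline the abstract above outlines: per-forum sentiment feature vectors are grouped with K-means to expose hotspot clusters, and an SVM is then fitted on the cluster labels so it can forecast them. The features here are synthetic stand-ins; upstream emotional-polarity scoring is assumed to have produced them:

    # Hedged sketch: K-means hotspot grouping over per-forum sentiment features,
    # plus an SVM trained to reproduce (and later forecast) the cluster labels.
    import numpy as np
    from sklearn.cluster import KMeans
    from sklearn.svm import SVC

    rng = np.random.default_rng(0)
    features = rng.normal(size=(31, 4))       # 31 forums x sentiment features

    kmeans = KMeans(n_clusters=2, n_init=10, random_state=0).fit(features)
    svm = SVC(kernel="rbf").fit(features, kmeans.labels_)

    forecast = svm.predict(features)           # would use next-period features
    agreement = float((forecast == kmeans.labels_).mean())
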
@incollection{Baudoin20113,
  title = {1 - Introduction: Mobile robotics systems for humanitarian de-mining and risky interventions },
  editor = {Baudoin, Y.  and Habib, Maki K. },
  booktitle = {Using Robots in Hazardous Environments },
  publisher = {Woodhead Publishing},
  edition = {},
  address = {},
  year = {2011},
  pages = {3 - 31},
  isbn = {978-1-84569-786-0},
  doi = {http://dx.doi.org/10.1533/9780857090201.1.3},
  url = {http://www.sciencedirect.com/science/article/pii/B9781845697860500010},
  author = {Y. Baudoin and M.K. Habib and I. Doroftei},
  keywords = {mobile robotics},
  keywords = {sensor systems},
  keywords = {human–machine-interface},
  keywords = {autonomous navigation},
  keywords = {de-mining techniques },
  abstract = {Dirty, dangerous and dull tasks, all of which are found in landmine detection, can be greatly aided by tele-operation. It is very desirable to remove the operator from the vicinity of the landmine and from the repetitive, boring operations that lead to loss of attention and potential injury. Tele-operated platforms naturally support multiple sensors and data fusion, which are necessary for reliable detection. Tele-operation of handheld sensors or multisensor heads can enhance the detection process by allowing more precise scanning, which is useful for optimization of the signal processing algorithms. This chapter summarizes the technologies and experiences presented during seven IARP HUDEM workshops and three IARP RISE workshops, based on general considerations and illustrated by some contributions from our own laboratory, located at the Royal Military Academy of Brussels, focusing on the detection of unexploded devices and the implementation of mobile robotics systems on minefields. }
}
@article{Quirin2010291,
  title = {Graph-based data mining: A new tool for the analysis and comparison of scientific domains represented as scientograms },
  journal = {Journal of Informetrics },
  volume = {4},
  number = {3},
  pages = {291 - 312},
  year = {2010},
  note = {},
  issn = {1751-1577},
  doi = {http://dx.doi.org/10.1016/j.joi.2010.01.004},
  url = {http://www.sciencedirect.com/science/article/pii/S1751157710000052},
  author = {Arnaud Quirin and Oscar Cordón and Benjamín Vargas-Quesada and Félix de Moya-Anegón},
  keywords = {Domain analysis},
  keywords = {Social networks},
  keywords = {Scientograms},
  keywords = {Graph-based data mining},
  keywords = {Scientogram mining},
  keywords = {Subdue algorithm },
  abstract = {The creation of representations depicting the current state of science (or scientograms) has been an established practice for many years now. However, if we are concerned with the automatic comparison, analysis and understanding of a set of scientograms, showing for instance the evolution of a scientific domain or a face-to-face comparison of several countries, the task is titanically complex, as the amount of data to analyze becomes huge. In this paper, we aim to show that graph-based data mining tools are useful for scientogram analysis. Subdue, the first algorithm proposed in the graph mining area, has been chosen for this purpose. This algorithm has been customized to deal with three different scientogram analysis tasks regarding the evolution of a scientific domain over time, the extraction of the common research category substructures in the world, and the comparison of scientific domains between different countries. The outcomes obtained in the experiments clearly demonstrate the potential of graph mining tools in scientogram analysis. }
}
@article{Jung2010110,
  title = {Automatic construction of a large-scale situation ontology by mining how-to instructions from the web },
  journal = {Web Semantics: Science, Services and Agents on the World Wide Web },
  volume = {8},
  number = {2–3},
  pages = {110 - 124},
  year = {2010},
  note = {Bridging the Gap—Data Mining and Social Network Analysis for Integrating Semantic Web and Web 2.0 The Future of Knowledge Dissemination: The Elsevier Grand Challenge for the Life Sciences },
  issn = {1570-8268},
  doi = {http://dx.doi.org/10.1016/j.websem.2010.04.006},
  url = {http://www.sciencedirect.com/science/article/pii/S1570826810000302},
  author = {Yuchul Jung and Jihee Ryu and Kyung-min Kim and Sung-Hyon Myaeng},
  keywords = {Automatic ontology construction},
  keywords = {Situation ontology},
  keywords = {Action mining},
  keywords = {How-to instruction},
  keywords = {Service recommendation},
  keywords = {Automatic service composition },
  abstract = {With the growing interests in semantic web services and context-aware computing, the importance of ontologies, which enable us to perform context-aware reasoning, has been accepted widely. While domain-specific and general-purpose ontologies have been developed, few attempts have been made for a situation ontology that can be employed directly to support activity-oriented context-aware services. In this paper, we propose an approach to automatically constructing a large-scale situation ontology by mining large-scale web resources, eHow and wikiHow, which contain an enormous amount of how-to instructions (e.g., “How to install a car amplifier”). The construction process is guided by a situation model derived from the procedural knowledge available in the web resources. Two major steps involved are: (1) action mining that extracts pairs of a verb and its ingredient (i.e., objects, location, and time) from individual instructional steps (e.g., <disconnect, ground cable>) and forms goal-oriented situation cases using the results and (2) normalization and integration of situation cases to form the situation ontology. For validation, we measure accuracy of the action mining method and show how our situation ontology compares in terms of coverage with existing large-scale ontology-like resources constructed manually. Furthermore, we show how it can be utilized for two applications: service recommendation and service composition. }
}
@article{Chen20104496,
  title = {Using data mining techniques to automatically construct concept maps for adaptive learning systems },
  journal = {Expert Systems with Applications },
  volume = {37},
  number = {6},
  pages = {4496 - 4503},
  year = {2010},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2009.12.060},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417409011063},
  author = {Shyi-Ming Chen and Shih-Ming Bai},
  keywords = {Adaptive learning systems},
  keywords = {Apriori algorithm},
  keywords = {Concept maps},
  keywords = {Concepts-relationship analysis},
  keywords = {Data mining },
  abstract = {Constructing concept maps to provide learning guidance to learners is an important research topic in adaptive learning systems. Because the existing method for constructing concept maps considers only the association rules for questions that are not correctly answered, it misses information about questions that are correctly answered by the learners. Moreover, the existing method also has the drawback that it builds unnecessary relationships or loses some relationships between concepts in the constructed concept maps. In this paper, we present a new method to automatically construct concept maps based on data mining techniques for adaptive learning systems. The proposed method overcomes the drawbacks of the existing method and provides a useful way to automatically construct concept maps in adaptive learning systems. }
}
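
The mining step above relies on Apriori-style frequent itemsets over learners' answer records. A self-contained miniature Apriori, kept library-free for clarity; the transactions are hypothetical sets of question identifiers:

    # Hedged sketch: a miniature Apriori over answer records, where each
    # transaction is the set of questions a learner answered (in)correctly.
    def apriori(transactions, min_support):
        n = len(transactions)
        support = lambda s: sum(1 for t in transactions if s <= t) / n
        current = {frozenset([i]) for t in transactions for i in t}
        current = {s for s in current if support(s) >= min_support}
        frequent, k = {}, 1
        while current:
            frequent.update({s: support(s) for s in current})
            current = {a | b for a in current for b in current
                       if len(a | b) == k + 1 and support(a | b) >= min_support}
            k += 1
        return frequent

    # apriori([{"Q1", "Q2"}, {"Q1", "Q2", "Q3"}, {"Q2", "Q3"}], 0.5)
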
@article{Erra2015143,
  title = {Approximate TF–IDF based on topic extraction from massive message stream using the \{GPU\} },
  journal = {Information Sciences },
  volume = {292},
  number = {0},
  pages = {143 - 161},
  year = {2015},
  note = {},
  issn = {0020-0255},
  doi = {http://dx.doi.org/10.1016/j.ins.2014.08.062},
  url = {http://www.sciencedirect.com/science/article/pii/S0020025514008676},
  author = {Ugo Erra and Sabrina Senatore and Fernando Minnella and Giuseppe Caggianese},
  keywords = {Twitter},
  keywords = {TF–IDF},
  keywords = {GPU},
  keywords = {Topic extraction},
  keywords = {Frequent items},
  keywords = {Massive data stream },
  abstract = {The Web is a constantly expanding global information space that includes disparate types of data and resources. Recent trends demonstrate the urgent need to manage large amounts of streaming data, especially in specific domains of application such as critical infrastructure systems, sensor networks, log file analysis, search engines and, more recently, social networks. All of these applications involve large-scale data-intensive tasks, often subject to time constraints and space complexity. Algorithms, data management and data retrieval techniques must be able to process the data stream, i.e., process data as it becomes available and provide an accurate response based solely on the data stream that has already been provided. Data retrieval techniques often require a traditional data storage and processing approach, i.e., all data must be available in the storage space in order to be processed. For instance, a widely used relevance measure is Term Frequency–Inverse Document Frequency (TF–IDF), which can evaluate how important a word is in a collection of documents but requires a priori knowledge of the whole dataset. To address this problem, we propose an approximate version of the TF–IDF measure suitable for working on a continuous data stream (such as the exchange of messages, tweets and sensor-based log files). The algorithm for the calculation of this measure makes two assumptions: a fast response is required, and memory is both limited and infinitely smaller than the size of the data stream. In addition, to face the great computational power required to process a massive data stream, we also present a parallel implementation of the approximate TF–IDF calculation using Graphical Processing Units (GPUs). This implementation of the algorithm was tested on generated and real data streams and was able to capture the most frequent terms. Our results demonstrate that the approximate version of the TF–IDF measure performs at a level that is comparable to the solution of the precise TF–IDF measure. }
}
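
The memory assumption above is the interesting constraint: with one pass and bounded space, term counts can be maintained with a fixed-capacity frequent-items summary. The sketch below uses a Misra-Gries-style counter for term frequencies (document frequencies kept exact for brevity); it is one plausible reading of an approximate streaming TF-IDF, not the authors' GPU algorithm:

    # Hedged sketch: one-pass, capacity-bounded TF tracking (Misra-Gries) with
    # an exact document-frequency table, combined into an approximate TF-IDF.
    import math
    from collections import Counter

    class StreamingTfIdf:
        def __init__(self, capacity=1000):
            self.capacity = capacity
            self.tf = {}             # approximate term counts (bounded size)
            self.df = Counter()      # documents containing each term
            self.n_docs = 0

        def add_document(self, tokens):
            self.n_docs += 1
            for term in set(tokens):
                self.df[term] += 1
            for term in tokens:
                if term in self.tf or len(self.tf) < self.capacity:
                    self.tf[term] = self.tf.get(term, 0) + 1
                else:                              # Misra-Gries decrement step
                    for t in list(self.tf):
                        self.tf[t] -= 1
                        if self.tf[t] == 0:
                            del self.tf[t]

        def tfidf(self, term):
            if term not in self.tf or self.n_docs == 0:
                return 0.0
            return self.tf[term] * math.log(self.n_docs / (1 + self.df[term]))
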
@article{Sevindik20105120,
  title = {Virtual education environments and web mining },
  journal = {Procedia - Social and Behavioral Sciences },
  volume = {2},
  number = {2},
  pages = {5120 - 5124},
  year = {2010},
  note = {Innovation and Creativity in Education },
  issn = {1877-0428},
  doi = {http://dx.doi.org/10.1016/j.sbspro.2010.03.832},
  url = {http://www.sciencedirect.com/science/article/pii/S1877042810008724},
  author = {Tuncay Sevindik and Necmi Demirkeser and Zafer Cömert},
  keywords = {Attitude},
  keywords = {distance education},
  keywords = {education},
  keywords = {web mining},
  keywords = {virtual },
  abstract = {Nowadays, education environments take two forms: traditional and virtual education environments. In both environments, one of the important problems is habit analysis and evaluation. In traditional education systems, habit analysis is done easily through observation techniques, whereas in virtual ones analyzing students’ attitudes and habits is a significant problem. Web mining applications, used to extract meaningful information from otherwise meaningless habits and surfing in web environments, are the way to overcome this problem. In this study, web mining applications in virtual education environments were examined in accordance with the descriptive research method. In conclusion, the study shows how web mining can be used in education environments. }
}
@article{Tsai20102359,
  title = {Evaluation of novelty metrics for sentence-level novelty mining },
  journal = {Information Sciences },
  volume = {180},
  number = {12},
  pages = {2359 - 2374},
  year = {2010},
  note = {},
  issn = {0020-0255},
  doi = {http://dx.doi.org/10.1016/j.ins.2010.02.020},
  url = {http://www.sciencedirect.com/science/article/pii/S0020025510000988},
  author = {Flora S. Tsai and Wenyin Tang and Kap Luk Chan},
  keywords = {Novelty metric},
  keywords = {Novelty detection},
  keywords = {Novelty mining},
  keywords = {Performance evaluation},
  keywords = {Information retrieval },
  abstract = {This work addresses the problem of detecting novel sentences from an incoming stream of text data, by studying the performance of different novelty metrics, and proposing a mixed metric that is able to adapt to different performance requirements. Existing novelty metrics can be divided into two types, symmetric and asymmetric, based on whether the ordering of sentences is taken into account. After a comparative study of several different novelty metrics, we observe complementary behavior in the two types of metrics. This finding motivates a new framework of novelty measurement, i.e. the mixture of both symmetric and asymmetric metrics. This new framework of novelty measurement performs superiorly under different performance requirements varying from high-precision to high-recall as well as for data with different percentages of novel sentences. Because it does not require any prior information, the new metric is very suitable for real-time knowledge base applications such as novelty mining systems where no training data is available beforehand. }
}
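
The symmetric/asymmetric distinction above can be made concrete: cosine distance to the most similar previously seen sentence treats the two sentences interchangeably, while a new-word ratio depends directly on what came before; the mixed metric interpolates between the two. Vector representations and the mixing weight alpha are assumptions of this sketch:

    # Hedged sketch: a symmetric (cosine) and an asymmetric (new-word) novelty
    # score for a sentence against its history, plus a simple convex mixture.
    import numpy as np

    def cosine_novelty(vec, history_vecs):          # symmetric ingredient
        sims = [float(np.dot(vec, h) / (np.linalg.norm(vec) * np.linalg.norm(h)))
                for h in history_vecs]
        return 1.0 - max(sims, default=0.0)

    def new_word_novelty(words, history_words):     # asymmetric ingredient
        words = set(words)
        return len(words - history_words) / len(words) if words else 0.0

    def mixed_novelty(vec, words, history_vecs, history_words, alpha=0.5):
        return (alpha * cosine_novelty(vec, history_vecs)
                + (1 - alpha) * new_word_novelty(words, history_words))
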
@article{Sundarkumar2015368,
  title = {A novel hybrid undersampling method for mining unbalanced datasets in banking and insurance },
  journal = {Engineering Applications of Artificial Intelligence },
  volume = {37},
  number = {0},
  pages = {368 - 377},
  year = {2015},
  note = {},
  issn = {0952-1976},
  doi = {http://dx.doi.org/10.1016/j.engappai.2014.09.019},
  url = {http://www.sciencedirect.com/science/article/pii/S0952197614002395},
  author = {G. Ganesh Sundarkumar and Vadlamani Ravi},
  keywords = {Insurance fraud detection},
  keywords = {Credit card churn prediction},
  keywords = {Undersampling},
  keywords = {K- Reverse Nearest Neighbourhood method},
  keywords = {One-class support vector machine },
  abstract = {In this paper, we propose a novel hybrid approach for rectifying the data imbalance problem by employing k Reverse Nearest Neighborhood and One-Class Support Vector Machine (OCSVM) in tandem. We mined an automobile insurance fraud detection dataset and a customer credit card churn prediction dataset to demonstrate the effectiveness of the proposed model. Throughout the paper, we followed the 10-fold cross-validation method of testing using Decision Tree (DT), Support Vector Machine (SVM), Logistic Regression (LR), Probabilistic Neural Network (PNN), Group Method of Data Handling (GMDH) and Multi-Layer Perceptron (MLP). We observed that DT and SVM yielded high sensitivities of 90.74% and 91.89%, respectively, on the insurance dataset, and DT, SVM and GMDH yielded high sensitivities of 91.2%, 87.7% and 83.1%, respectively, on the credit card churn prediction dataset. In the case of the insurance fraud detection dataset, we found that statistically there is no significant difference between DT (J48) and SVM. As DT yields “if then” rules, we prefer DT over SVM. Further, in the case of the churn prediction dataset, it turned out that GMDH, SVM and LR are not statistically different, and GMDH yielded a very high area under the ROC curve. Further, DT yielded just 4 ‘if–then’ rules on the insurance dataset and 10 rules on the churn prediction dataset, which is a significant outcome of the study. }
}
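
A sketch of the two-stage undersampling described above: majority-class points that appear in nobody's k-nearest-neighbour list (an empty reverse-kNN set) are dropped as outliers, and a One-Class SVM then keeps only the points it accepts as inliers. Parameter values are illustrative, not the paper's tuned settings:

    # Hedged sketch: hybrid undersampling of the majority class via reverse-kNN
    # outlier removal followed by One-Class SVM pruning.
    import numpy as np
    from sklearn.neighbors import NearestNeighbors
    from sklearn.svm import OneClassSVM

    def hybrid_undersample(X_majority, k=5, nu=0.1):
        nn = NearestNeighbors(n_neighbors=k + 1).fit(X_majority)
        _, idx = nn.kneighbors(X_majority)
        counts = np.zeros(len(X_majority), dtype=int)   # reverse-kNN counts
        for row in idx[:, 1:]:                          # column 0 is the point itself
            counts[row] += 1
        X_kept = X_majority[counts > 0]
        ocsvm = OneClassSVM(kernel="rbf", nu=nu).fit(X_kept)
        return X_kept[ocsvm.predict(X_kept) == 1]       # +1 marks inliers
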
@article{Chou20102898,
  title = {Integrating web mining and neural network for personalized e-commerce automatic service },
  journal = {Expert Systems with Applications },
  volume = {37},
  number = {4},
  pages = {2898 - 2910},
  year = {2010},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2009.09.047},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417409008288},
  author = {Pao-Hua Chou and Pi-Hsiang Li and Kuang-Ku Chen and Menq-Jiun Wu},
  keywords = {Electronic commerce},
  keywords = {Data mining},
  keywords = {Neural network},
  keywords = {Automatic service },
  abstract = {Electronic commerce (EC) has become a worldwide trend. However, most research neglects a fundamental issue: the user’s product-specific knowledge, on which useful intelligent systems are based. This research employs the user’s product-specific knowledge and mines his/her latent desire for appropriate target products as part of a personalization process to construct an overall EC strategy for businesses. This paper illustrates a novel web usage mining approach, based on the sequence mining technique applied to the user’s navigation behaviour, to discover patterns in the navigation of websites. Three critical contributions are made in this paper: (1) using the footstep graph to visualize the user’s click-stream data, so that any interesting pattern can be detected more easily and quickly; (2) illustrating a novel sequence mining approach to identify pre-designated user navigation patterns automatically, integrating a back-propagation network (BPN) model smoothly; and (3) applying empirical research to indicate that the proposed approach can predict and categorize users’ navigation behaviour with high accuracy. }
}
@article{RomeroMorales2010554,
  title = {Forecasting cancellation rates for services booking revenue management using data mining },
  journal = {European Journal of Operational Research },
  volume = {202},
  number = {2},
  pages = {554 - 562},
  year = {2010},
  note = {},
  issn = {0377-2217},
  doi = {http://dx.doi.org/10.1016/j.ejor.2009.06.006},
  url = {http://www.sciencedirect.com/science/article/pii/S0377221709004494},
  author = {Dolores Romero Morales and Jingbo Wang},
  keywords = {Revenue management},
  keywords = {Cancellation rate forecasting},
  keywords = {PNR data mining},
  keywords = {Two-class probability estimation},
  keywords = {Time-dependency },
  abstract = {Revenue management (RM) enhances the revenues of a company by means of demand-management decisions. An RM system must take into account the possibility that a booking may be canceled, or that a booked customer may fail to show up at the time of service (no-show). We review the Passenger Name Record data mining based cancellation rate forecasting models proposed in the literature, which mainly address the no-show case. Using a real-world dataset, we illustrate how the set of relevant variables to describe cancellation behavior is very different in different stages of the booking horizon, which not only confirms the dynamic aspect of this problem, but will also help revenue managers better understand the drivers of cancellation. Finally, we examine the performance of the state-of-the-art data mining methods when applied to Passenger Name Record based cancellation rate forecasting. }
}
@article{Guo201479,
  title = {From spoken narratives to domain knowledge: Mining linguistic data for medical image understanding },
  journal = {Artificial Intelligence in Medicine },
  volume = {62},
  number = {2},
  pages = {79 - 90},
  year = {2014},
  note = {},
  issn = {0933-3657},
  doi = {http://dx.doi.org/10.1016/j.artmed.2014.08.001},
  url = {http://www.sciencedirect.com/science/article/pii/S0933365714000852},
  author = {Xuan Guo and Qi Yu and Cecilia Ovesdotter Alm and Cara Calvelli and Jeff B. Pelz and Pengcheng Shi and Anne R. Haake},
  keywords = {Unified Medical Language System},
  keywords = {Lexical consensus},
  keywords = {Semantic relatedness},
  keywords = {Clustering algorithm},
  keywords = {Image-based diagnostic reasoning},
  keywords = {Medical data analysis },
  abstract = {Objectives: Extracting useful visual clues from medical images allowing accurate diagnoses requires physicians’ domain knowledge acquired through years of systematic study and clinical training. This is especially true in the dermatology domain, a medical specialty that requires physicians to have image inspection experience. Automating or at least aiding such efforts requires understanding physicians’ reasoning processes and their use of domain knowledge. Mining physicians’ references to medical concepts in narratives during image-based diagnosis of a disease is an interesting research topic that can help reveal experts’ reasoning processes. It can also be a useful resource to assist with the design of information technologies for image use and for image case-based medical education systems. Methods and materials: We collected data for analyzing physicians’ diagnostic reasoning processes by conducting an experiment that recorded their spoken descriptions during inspection of dermatology images. In this paper we focus on the benefit of physicians’ spoken descriptions and provide a general workflow for mining medical domain knowledge based on linguistic data from these narratives. The challenge of a medical image case can influence the accuracy of the diagnosis as well as how physicians pursue the diagnostic process. Accordingly, we define two lexical metrics for physicians’ narratives, the lexical consensus score and the top N relatedness score, and evaluate their usefulness by assessing the diagnostic challenge levels of corresponding medical images. We also report on clustering medical images based on anchor concepts obtained from physicians’ medical term usage. These analyses are based on physicians’ spoken narratives that have been preprocessed by incorporating the Unified Medical Language System for detecting medical concepts. Results: The image rankings based on the lexical consensus score and on the top 1 relatedness score are well correlated with those based on challenge levels (Spearman correlation >0.5 and Kendall correlation >0.4). Clustering results are largely improved based on our anchor concept method (accuracy >70% and mutual information >80%). Conclusions: Physicians’ spoken narratives are valuable for the purpose of mining the domain knowledge that physicians use in medical image inspections. We also show that the semantic metrics introduced in the paper can be successfully applied to medical image understanding, and we discuss additional uses of these metrics. }
}
@article{PinoMejías2010826,
  title = {Predicting the potential habitat of oaks with data mining models and the R system },
  journal = {Environmental Modelling & Software },
  volume = {25},
  number = {7},
  pages = {826 - 836},
  year = {2010},
  note = {},
  issn = {1364-8152},
  doi = {http://dx.doi.org/10.1016/j.envsoft.2010.01.004},
  url = {http://www.sciencedirect.com/science/article/pii/S1364815210000150},
  author = {Rafael Pino-Mejías and María Dolores Cubiles-de-la-Vega and María Anaya-Romero and Antonio Pascual-Acosta and Antonio Jordán-López and Nicolás Bellinfante-Crocci},
  keywords = {Habitat modelling},
  keywords = {Supervised classification},
  keywords = {R system},
  keywords = {Data mining models},
  keywords = {Ensemble models},
  keywords = {Classification trees},
  keywords = {Neural networks},
  keywords = {Oaks},
  keywords = {Support vector machines },
  abstract = {Oak forests are essential for the ecosystems of many countries, particularly when they are used in vegetal restoration. Therefore, models for predicting the potential habitat of oaks can be a valuable tool for work in the environment. In accordance with this objective, the building and comparison of data mining models are presented for the prediction of potential habitats for the oak forest type in Mediterranean areas (southern Spain), with conclusions applicable to other regions. Thirty-one environmental input variables were measured and six base models for supervised classification problems were selected: linear and quadratic discriminant analysis, logistic regression, classification trees, neural networks and support vector machines. Three ensemble methods, based on the combination of classification tree models fitted from samples and sets of variables generated from the original data set were also evaluated: bagging, random forests and boosting. The available data set was randomly split into three parts: training set (50%), validation set (25%), and test set (25%). The analysis of the accuracy, the sensitivity, the specificity, together with the area under the ROC curve for the test set reveal that the best models for our oak data set are those of bagging and random forests. All of these models can be fitted by free R programs which use the libraries and functions described in this paper. Furthermore, the methodology used in this study will allow researchers to determine the potential distribution of oaks in other kinds of areas. }
}
@article{Yu20102459,
  title = {A load-balanced distributed parallel mining algorithm },
  journal = {Expert Systems with Applications },
  volume = {37},
  number = {3},
  pages = {2459 - 2464},
  year = {2010},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2009.07.074},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417409007428},
  author = {Kun-Ming Yu and Jiayi Zhou and Tzung-Pei Hong and Jia-Ling Zhou},
  keywords = {Parallel and distributed processing},
  keywords = {Cluster computing},
  keywords = {Frequent patterns},
  keywords = {Association rules},
  keywords = {Data mining },
  abstract = {Due to the exponential growth in worldwide information, companies have to deal with an ever-growing amount of digital information. One of the most important challenges for data mining is quickly and correctly finding the relationships among data. The Apriori algorithm has been the most popular technique for finding frequent patterns. However, when applying this method, a database has to be scanned many times to calculate the counts of a huge number of candidate itemsets. Parallel and distributed computing is an effective strategy for accelerating the mining process. In this paper, the Distributed Parallel Apriori (DPA) algorithm is proposed as a solution to this problem. In the proposed method, metadata are stored in the form of Transaction Identifiers (TIDs), such that only a single scan of the database is needed. The approach also takes the factor of itemset counts into consideration, thus generating a balanced workload among processors and reducing processor idle time. Experiments on a PC cluster with 16 computing nodes are also made to show the performance of the proposed approach and compare it with some other parallel mining algorithms. The experimental results show that the proposed approach outperforms the others, especially when the minimum supports are low. }
}
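
The single-scan idea above is vertical mining: one pass records, for each item, the set of transaction identifiers (TIDs) that contain it, after which the support of any candidate itemset is a set intersection rather than a database rescan. A minimal single-process sketch of that layout (the paper's actual contribution, distributing and load-balancing these lists across cluster nodes, is omitted):

    # Hedged sketch: vertical TID-list layout; one scan builds item -> TID set,
    # and itemset support becomes the size of an intersection.
    from collections import defaultdict
    from functools import reduce

    def build_tid_lists(transactions):
        tids = defaultdict(set)
        for tid, items in enumerate(transactions):   # the single database scan
            for item in items:
                tids[item].add(tid)
        return tids

    def support(itemset, tids, n_transactions):
        common = reduce(set.intersection, (tids[i] for i in itemset))
        return len(common) / n_transactions

    txns = [{"a", "b"}, {"a", "c"}, {"a", "b", "c"}]
    print(support({"a", "b"}, build_tid_lists(txns), len(txns)))   # 2/3
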
@article{Hu2010190,
  title = {A data mining framework for time series estimation },
  journal = {Journal of Biomedical Informatics },
  volume = {43},
  number = {2},
  pages = {190 - 199},
  year = {2010},
  note = {},
  issn = {1532-0464},
  doi = {http://dx.doi.org/10.1016/j.jbi.2009.11.002},
  url = {http://www.sciencedirect.com/science/article/pii/S1532046409001488},
  author = {Xiao Hu and Peng Xu and Shaozhi Wu and Shadnaz Asgari and Marvin Bergsneider},
  keywords = {Data mining},
  keywords = {System identification},
  keywords = {Regression},
  keywords = {Time series },
  abstract = {Time series estimation techniques are usually employed in biomedical research to derive variables less accessible from a set of related and more accessible variables. These techniques are traditionally built from systems modeling approaches including simulation, blind deconvolution, and state estimation. In this work, we define the target time series (TTS) and its related time series (RTS) as the output and input of a time series estimation process, respectively. We then propose a novel data mining framework for time series estimation when TTS and RTS represent different sets of observed variables from the same dynamic system. This is made possible by mining a database of instances of TTS, its simultaneously recorded RTS, and the input/output dynamic models between them. The key mining strategy is to formulate a mapping function for each TTS–RTS pair in the database that translates a feature vector extracted from RTS to the dissimilarity between the true TTS and its estimate from the dynamic model associated with the same TTS–RTS pair. At run time, a feature vector is extracted from an inquiry RTS and supplied to the mapping function associated with each TTS–RTS pair to calculate a dissimilarity measure. An optimal TTS–RTS pair is then selected by analyzing these dissimilarity measures. The associated input/output model of the selected TTS–RTS pair is then used to simulate the TTS given the inquiry RTS as an input. An exemplary implementation was built to address the biomedical problem of noninvasive intracranial pressure assessment. The performance of the proposed method was superior to that of a simple training-free approach of finding the optimal TTS–RTS pair by a conventional similarity-based search on RTS features. }
}
@article{Chen2010193,
  title = {Mining fuzzy frequent itemsets for hierarchical document clustering },
  journal = {Information Processing & Management },
  volume = {46},
  number = {2},
  pages = {193 - 211},
  year = {2010},
  note = {},
  issn = {0306-4573},
  doi = {http://dx.doi.org/10.1016/j.ipm.2009.09.009},
  url = {http://www.sciencedirect.com/science/article/pii/S0306457309001113},
  author = {Chun-Ling Chen and Frank S.C. Tseng and Tyne Liang},
  keywords = {Fuzzy association rule mining},
  keywords = {Text mining},
  keywords = {Hierarchical document clustering},
  keywords = {Frequent itemsets },
  abstract = {As text documents increase explosively on the Internet, hierarchical document clustering has proven useful for grouping similar documents for versatile applications. However, most document clustering methods still suffer from challenges in dealing with the problems of high dimensionality, scalability, accuracy, and meaningful cluster labels. In this paper, we present an effective Fuzzy Frequent Itemset-Based Hierarchical Clustering (F2IHC) approach, which uses a fuzzy association rule mining algorithm to improve the clustering accuracy of the Frequent Itemset-Based Hierarchical Clustering (FIHC) method. In our approach, key terms are extracted from the document set, and each document is pre-processed into the designated representation for the subsequent mining process. Then, a fuzzy association rule mining algorithm for text is employed to discover a set of highly related fuzzy frequent itemsets, which contain key terms to be regarded as the labels of the candidate clusters. Finally, the documents are clustered into a hierarchical cluster tree by referring to these candidate clusters. We have conducted experiments to evaluate the performance based on the Classic4, Hitech, Re0, Reuters, and Wap datasets. The experimental results show that our approach not only retains the merits of FIHC, but also improves its clustering accuracy. }
}
@article{Rieping2014181,
  title = {Behavior analysis of elderly using topic models },
  journal = {Pervasive and Mobile Computing },
  volume = {15},
  number = {0},
  pages = {181 - 199},
  year = {2014},
  note = {Special Issue on Information Management in Mobile Applications Special Issue on Data Mining in Pervasive Environments },
  issn = {1574-1192},
  doi = {http://dx.doi.org/10.1016/j.pmcj.2014.07.001},
  url = {http://www.sciencedirect.com/science/article/pii/S1574119214001308},
  author = {Kristin Rieping and Gwenn Englebienne and Ben Kröse},
  keywords = {Activity discovery},
  keywords = {Sensor homes},
  keywords = {Pervasive computing},
  keywords = {Sequential patterns},
  keywords = {LDA},
  keywords = {Topic models },
  abstract = {This paper describes two new topic models for the analysis of human behavior in homes that are equipped with sensor networks. The models are based on Latent Dirichlet Allocation (LDA) topic models and can detect patterns in sensor data in an unsupervised manner. LDA–Gaussian, the first variation of the model, is a combination of a Gaussian Mixture Model and the LDA model. Here the multinomial distribution that is normally used in the LDA model is replaced by a set of Gaussian distributions. LDA–Poisson, the second variation of the model, uses a set of Poisson distributions to model the observations. The Poisson distribution is better suited to handle counts of stochastic events but less well suited to model time. For this we use the von Mises distribution, resulting in LDA–Poisson–von-Mises. The parameters of the models are determined with an EM algorithm. The models are evaluated on more than 450 days of real-world sensor data, gathered in the homes of five elderly people, and are compared with a baseline approach where standard k-means clustering is used to quantize the data. We show that the new models find more meaningful topics than the baseline and that a semantic description of these topics can be given. We also evaluated the models quantitatively, using perplexity as the measure of model fit. Both LDA–Gaussian and LDA–Poisson result in much better models than the baseline, and our experiments show that, of the proposed models, the LDA–Poisson–von-Mises model performs best. }
}
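
For orientation, the unmodified LDA baseline that both variants above extend can be run on a days-by-sensor-event count matrix with scikit-learn, and its perplexity (the fit measure used above) queried directly; the Gaussian, Poisson and von Mises observation models themselves are the paper's custom extensions and are not available off the shelf. The count matrix here is synthetic:

    # Hedged sketch: plain LDA on a days x sensor-event count matrix, with
    # held-out perplexity as the model-fit measure. Data is synthetic.
    import numpy as np
    from sklearn.decomposition import LatentDirichletAllocation

    rng = np.random.default_rng(0)
    train = rng.poisson(lam=2.0, size=(350, 40))      # days x sensor events
    heldout = rng.poisson(lam=2.0, size=(100, 40))

    lda = LatentDirichletAllocation(n_components=5, random_state=0).fit(train)
    day_topics = lda.transform(heldout)                # per-day topic mixtures
    print(lda.perplexity(heldout))                     # lower = better fit
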
@article{Hsieh20104156,
  title = {A mining-based approach on discovering courses pattern for constructing suitable learning path },
  journal = {Expert Systems with Applications },
  volume = {37},
  number = {6},
  pages = {4156 - 4167},
  year = {2010},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2009.11.007},
  url = {http://www.sciencedirect.com/science/article/pii/S095741740900949X},
  author = {Tung-Cheng Hsieh and Tzone-I Wang},
  keywords = {Self-directed learner},
  keywords = {Data mining},
  keywords = {Formal Concept Analysis (FCA)},
  keywords = {Concept Lattice},
  keywords = {Learning path },
  abstract = {In recent years, the browser has become one of the most popular tools for searching information on the Internet. Although a person can conveniently find and download specific learning materials to gain fragmented knowledge, most of the materials are imperfect and have no particular order in their content. Therefore, most self-directed learners spend most of their time surveying and choosing the right learning materials collected from the Internet. This paper develops a web-based learning support system that harnesses two approaches, the learning path constructing approach and the learning object recommending approach. With collected documents and a learning subject from a learner, the system first discovers candidate courses by using a data mining approach based on the Apriori algorithm. Next, the learning path constructing approach, based on Formal Concept Analysis, builds a Concept Lattice, using keywords extracted from selected documents, to form a relationship hierarchy of all the concepts represented by the keywords. It then uses FCA to further compute mutual relationships among documents to decide a suitable learning path. For a chosen learning path, the support system uses both preference-based and correlation-based algorithms to recommend the most suitable learning objects or documents for each unit of the courses, in order to facilitate more efficient learning for the learner. This e-learning support system can be embedded in any information retrieval system for surfers to learn more efficiently on the Internet. }
}
@article{Kim200978,
  title = {Efficient management of marine resources in conflict: An empirical study of marine sand mining, Korea },
  journal = {Journal of Environmental Management },
  volume = {91},
  number = {1},
  pages = {78 - 86},
  year = {2009},
  note = {},
  issn = {0301-4797},
  doi = {http://dx.doi.org/10.1016/j.jenvman.2009.07.006},
  url = {http://www.sciencedirect.com/science/article/pii/S0301479709002370},
  author = {Tae-goun Kim},
  keywords = {Hotelling extraction rule},
  keywords = {Marine sand mining},
  keywords = {External costs},
  keywords = {Social optimum},
  keywords = {Marginal user cost },
  abstract = {This article develops a dynamic model of efficient use of exhaustible marine sand resources in the context of marine mining externalities. The classical Hotelling extraction model is applied to sand mining in Ongjin, Korea and extended to include the estimated marginal external costs that mining imposes on marine fisheries. The socially efficient sand extraction plan is compared with the extraction paths suggested by scientific research. If marginal environmental costs are correctly estimated, the developed efficient extraction plan considering the resource rent may increase the social welfare and reduce the conflicts among the marine sand resource users. The empirical results are interpreted with an emphasis on guidelines for coastal resource management policy. }
}
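
For reference, the classical Hotelling rule that the study extends says the resource rent grows at the discount rate; the externality-adjusted variant subtracts a marginal external cost term. A sketch in LaTeX, with MC and MEC as generic placeholders for the paper's estimated cost functions:

    % Hotelling rule and an externality-adjusted variant (placeholder forms only).
    \[
      \underbrace{p_t - MC(q_t)}_{\text{resource rent}} = \lambda_0 e^{rt}
      \qquad \text{(rent grows at the discount rate } r\text{)}
    \]
    \[
      p_t - MC(q_t) - MEC(q_t) = \lambda_0 e^{rt},
      \qquad \sum_t q_t \le S_0 \quad \text{(finite sand stock } S_0\text{)}
    \]
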
@article{Seiter2014215,
  title = {Discovery of activity composites using topic models: An analysis of unsupervised methods },
  journal = {Pervasive and Mobile Computing },
  volume = {15},
  number = {0},
  pages = {215 - 227},
  year = {2014},
  note = {Special Issue on Information Management in Mobile Applications Special Issue on Data Mining in Pervasive Environments },
  issn = {1574-1192},
  doi = {http://dx.doi.org/10.1016/j.pmcj.2014.05.007},
  url = {http://www.sciencedirect.com/science/article/pii/S1574119214000832},
  author = {Julia Seiter and Oliver Amft and Mirco Rossi and Gerhard Tröster},
  keywords = {Activity routines},
  keywords = {Daily routines},
  keywords = {Topic modeling},
  keywords = {Hierarchical activity recognition},
  keywords = {Activity discovery },
  abstract = {Abstract In this work we investigate unsupervised activity discovery approaches using three topic model (TM) approaches, based on Latent Dirichlet Allocation (LDA), n-gram TM (NTM), and correlated TM (CTM). While \{LDA\} structures activity primitives, \{NTM\} adds primitive sequence information, and \{CTM\} exploits co-occurring topics. We use an activity composite/primitive abstraction and analyze three public datasets with different properties that affect the discovery, including primitive rate, activity composite specificity, primitive sequence similarity, and composite-instance ratio. We compare the activity composite discovery performance among the \{TM\} approaches and against a baseline using k-means clustering. We provide guidelines for method and optimal \{TM\} parameter selection, depending on data properties and activity primitive noise. Results indicate that \{TMs\} can outperform k-means clustering by up to 17% when composite specificity is low. LDA-based \{TMs\} showed higher robustness against noise compared to other \{TMs\} and k-means. }
}
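
The contrast above, soft topic mixtures versus hard k-means assignments, can be shown in a few lines. A sketch assuming scikit-learn and an invented primitive-count matrix (rows are time windows, columns are activity primitives); this is not the paper's pipeline:

    # LDA topic mixtures vs. a k-means baseline on toy count data.
    import numpy as np
    from sklearn.decomposition import LatentDirichletAllocation
    from sklearn.cluster import KMeans

    rng = np.random.default_rng(1)
    X = rng.poisson(lam=2.0, size=(60, 8))     # invented counts of 8 primitives

    lda = LatentDirichletAllocation(n_components=3, random_state=1).fit(X)
    topic_mix = lda.transform(X)               # soft composite mixture per window

    km = KMeans(n_clusters=3, n_init=10, random_state=1).fit(X)
    print(topic_mix[0], km.labels_[0])         # soft vs. hard assignment
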
@incollection{Abedi2015184,
  title = {Chapter 13 - Literature Mining and Ontology Mapping Applied to Big Data },
  editor = {Akhgar, Babak and Saathoff, Gregory B. and Arabnia, Hamid R. and Hill, Richard and Staniforth, Andrew and Bayerl, Petra Saskia },
  booktitle = {Application of Big Data for National Security },
  publisher = {Butterworth-Heinemann},
  edition = {},
  address = {},
  year = {2015},
  pages = {184 - 208},
  isbn = {978-0-12-801967-2},
  doi = {http://dx.doi.org/10.1016/B978-0-12-801967-2.00013-6},
  url = {http://www.sciencedirect.com/science/article/pii/B9780128019672000136},
  author = {Vida Abedi and Mohammed Yeasin and Ramin Zand},
  keywords = {Adaptive},
  keywords = {Associations},
  keywords = {Biomedical},
  keywords = {Interface},
  keywords = {Novel},
  keywords = {Semantic },
  abstract = {Abstract Discovering the network of associations and relationships among diseases, genes, and risk factors is critical in clinical and translational research. The goal of this study was to design a system that would enable strategic reading/filtering and reduce information overload, generate new hypotheses, bridge the knowledge gap, and develop “smart apps.” We present the implementation of a text analytic system, Adaptive Robust and Integrative Analysis for Finding Novel Associations (ARIANA). The system is context-specific, modular, and scalable and able to capture direct and indirect associations among 2,545 biomedical concepts. An easy-to-use Web interface was developed to query, interact, and visualize the results. Empirical studies showed that the system was able to find novel associations and generate new hypotheses. For instance, the system captured the association between the drug hexamethonium and pulmonary fibrosis, which in 2001 caused the tragic death of a healthy volunteer. The software is available with a properly executed end-user licensing agreement at http://www.ARIANAmed.org. }
}
@article{Lai20092023,
  title = {Integrating knowledge flow mining and collaborative filtering to support document recommendation },
  journal = {Journal of Systems and Software },
  volume = {82},
  number = {12},
  pages = {2023 - 2037},
  year = {2009},
  note = {},
  issn = {0164-1212},
  doi = {http://dx.doi.org/10.1016/j.jss.2009.06.044},
  url = {http://www.sciencedirect.com/science/article/pii/S0164121209001575},
  author = {Chin-Hui Lai and Duen-Ren Liu},
  keywords = {Knowledge flow},
  keywords = {Knowledge flow mining},
  keywords = {Knowledge sharing},
  keywords = {Document recommendation},
  keywords = {Collaborative filtering},
  keywords = {Sequential rule mining},
  keywords = {Recommender system },
  abstract = {Knowledge is a critical resource that organizations use to gain and maintain competitive advantages. In the constantly changing business environment, organizations must exploit effective and efficient methods of preserving, sharing and reusing knowledge in order to help knowledge workers find task-relevant information. Hence, an important issue is how to discover and model the knowledge flow (KF) of workers from their historical work records. The objectives of a knowledge flow model are to understand knowledge workers’ task-needs and the ways they reference documents, and then provide adaptive knowledge support. This work proposes hybrid recommendation methods based on the knowledge flow model, which integrates \{KF\} mining, sequential rule mining and collaborative filtering techniques to recommend codified knowledge. These KF-based recommendation methods involve two phases: a \{KF\} mining phase and a KF-based recommendation phase. The \{KF\} mining phase identifies each worker’s knowledge flow by analyzing his/her knowledge referencing behavior (information needs), while the KF-based recommendation phase utilizes the proposed hybrid methods to proactively provide relevant codified knowledge for the worker. Therefore, the proposed methods use workers’ preferences for codified knowledge as well as their knowledge referencing behavior to predict their topics of interest and recommend task-related knowledge. Using data collected from a research institute laboratory, experiments are conducted to evaluate the performance of the proposed hybrid methods and compare them with the traditional \{CF\} method. The results of experiments demonstrate that utilizing the document preferences and knowledge referencing behavior of workers can effectively improve the quality of recommendations and facilitate efficient knowledge sharing. }
}
@article{Wang2009726,
  title = {Literature mining on pharmacokinetics numerical data: A feasibility study },
  journal = {Journal of Biomedical Informatics },
  volume = {42},
  number = {4},
  pages = {726 - 735},
  year = {2009},
  note = {},
  issn = {1532-0464},
  doi = {http://dx.doi.org/10.1016/j.jbi.2009.03.010},
  url = {http://www.sciencedirect.com/science/article/pii/S1532046409000495},
  author = {Zhiping Wang and Seongho Kim and Sara K. Quinney and Yingying Guo and Stephen D. Hall and Luis M. Rocha and Lang Li},
  keywords = {Clearance},
  keywords = {Data mining},
  keywords = {Entity recognition},
  keywords = {Information extraction},
  keywords = {Linear mixed model},
  keywords = {Midazolam},
  keywords = {Pharmacokinetics },
  abstract = {A feasibility study of literature mining is conducted on drug \{PK\} parameter numerical data with a sequential mining strategy. Firstly, an entity template library is built to retrieve pharmacokinetics-relevant articles. Then a set of tagging and extraction rules are applied to retrieve \{PK\} data from the article abstracts. To estimate the \{PK\} parameter population-average mean and between-study variance, a linear mixed meta-analysis model and an E–M algorithm are developed to describe the probability distributions of \{PK\} parameters. Finally, a cross-validation procedure is developed to ascertain false-positive mining results. Using this approach to mine midazolam (MDZ) \{PK\} data, an 88% precision rate and 92% recall rate are achieved, with an F-score = 90%. It greatly outperforms a conventional data mining approach (support vector machine), which has an F-score of 68.1%. Further investigation of 7 more drugs reveals comparable performance of our sequential mining approach. }
}
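
The F-score quoted above is the harmonic mean of precision and recall; the abstract's numbers can be checked directly:

    # F1 from the precision/recall figures quoted in the abstract.
    precision, recall = 0.88, 0.92
    f1 = 2 * precision * recall / (precision + recall)
    print(round(f1, 3))   # 0.9, i.e. the ~90% F-score reported
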
@article{Wang2015101,
  title = {Mining of protein–protein interfacial residues from massive protein sequential and spatial data },
  journal = {Fuzzy Sets and Systems },
  volume = {258},
  number = {0},
  pages = {101 - 116},
  year = {2015},
  note = {Special issue: Uncertainty in Learning from Big Data },
  issn = {0165-0114},
  doi = {http://dx.doi.org/10.1016/j.fss.2014.01.017},
  url = {http://www.sciencedirect.com/science/article/pii/S0165011414001109},
  author = {Debby D. Wang and Weiqiang Zhou and Hong Yan},
  keywords = {Protein–protein interface prediction},
  keywords = {3D alpha shape modeling},
  keywords = {Residue sequence profile},
  keywords = {Joint mutual information (JMI)},
  keywords = {Neuro-fuzzy classifiers (NFCs)},
  keywords = {Neighborhood classifiers (NECs)},
  keywords = {\{CART\}},
  keywords = {Extreme learning machines (ELMs)},
  keywords = {Naive Bayesian classifiers (NBCs) },
  abstract = {Abstract It is a great challenge to process big data in bioinformatics. In this paper, we addressed the problem of identifying protein–protein interfacial residues from massive protein structural data. A protein set, comprising 154 993 residues, was analyzed. We applied the three-dimensional alpha shape modeling to the search of surface and interfacial residues in this set, and adopted the spatially neighboring residue profiles to characterize each residue. These residue profiles, which revealed the sequential and spatial information of proteins, translated the original data into a large matrix. After vertically and horizontally refining this matrix, we comparatively implemented a series of popular learning procedures, including neuro-fuzzy classifiers (NFCs), CART, neighborhood classifiers (NECs), extreme learning machines (ELMs) and naive Bayesian classifiers (NBCs), to predict the interfacial residues, aiming to investigate the sensitivity of these massive structural data to different learning mechanisms. As a consequence, ELMs, \{CART\} and \{NFCs\} performed better in terms of computational costs; NFCs, \{NBCs\} and \{ELMs\} provided favorable prediction accuracies. Overall, NFCs, \{NBCs\} and \{ELMs\} are favorable choices for fast and accurate handling of this type of data. More importantly, the marginal differences between the prediction performances of these methods imply the insensitivity of this type of data to different learning mechanisms. }
}
@article{Lee201566,
  title = {Mining the function of protein tyrosine phosphatases in health and disease },
  journal = {Seminars in Cell & Developmental Biology },
  volume = {37},
  number = {0},
  pages = {66 - 72},
  year = {2015},
  note = {Cell signaling, division, and organization mediated by intrinsically disordered proteins & Protein tyrosine phosphatases },
  issn = {1084-9521},
  doi = {http://dx.doi.org/10.1016/j.semcdb.2014.09.021},
  url = {http://www.sciencedirect.com/science/article/pii/S1084952114002766},
  author = {Hojin Lee and Jae-Sung Yi and Ahmed Lawan and Kisuk Min and Anton M. Bennett},
  keywords = {Protein tyrosine phosphatases},
  keywords = {Phosphorylation},
  keywords = {Substrates},
  keywords = {Signal transduction},
  keywords = {Proteomics},
  keywords = {Genomics },
  abstract = {Abstract Protein tyrosine phosphatases (PTPs) play a crucial role in the regulation of human health and it is now clear that \{PTP\} dysfunction is causal to a variety of human diseases. Research in the \{PTP\} field has accelerated dramatically over the last decade fueled by cutting-edge technologies in genomic and proteomic techniques. This system-wide non-biased approach when applied to the discovery of \{PTP\} function has led to the elucidation of new and unanticipated roles for the PTPs. These discoveries, driven by genomic and proteomic approaches, have uncovered novel \{PTP\} findings that range from those that describe fundamental cell signaling mechanisms to implications for \{PTPs\} as novel therapeutic targets for the treatment of human disease. This review will discuss how new \{PTP\} functions have been uncovered through studies that have utilized genomic and proteomic technologies and strategies. }
}
@article{Pearson2009e97,
  title = {Influence of the MedDRA® hierarchy on pharmacovigilance data mining results },
  journal = {International Journal of Medical Informatics },
  volume = {78},
  number = {12},
  pages = {e97 - e103},
  year = {2009},
  note = {Mining of Clinical and Biomedical Text and Data Special Issue },
  issn = {1386-5056},
  doi = {http://dx.doi.org/10.1016/j.ijmedinf.2009.01.001},
  url = {http://www.sciencedirect.com/science/article/pii/S1386505609000033},
  author = {Ronald K. Pearson and Manfred Hauben and David I. Goldsmith and A. Lawrence Gould and David Madigan and Donald J. O’Hara and Stephanie J. Reisinger and Alan M. Hochberg},
  keywords = {MedDRA},
  keywords = {Data mining},
  keywords = {Terminology},
  keywords = {Proportional reporting rate},
  keywords = {Urn model},
  keywords = {Gamma Poisson Shrinker },
  abstract = {Purpose To compare the results of drug safety data mining with three different algorithms, when adverse events are identified using MedDRA (MedDRA® is a registered trademark of the International Federation of Pharmaceutical Manufacturers and Associations, IFPMA) Preferred Terms (PT) vs. High Level Terms (HLT) vs. Standardised MedDRA Queries (SMQ). Methods For a representative set of 26 drugs, data from the \{FDA\} Adverse Event Reporting System (AERS) database from 2001 through 2005 was mined for signals of disproportionate reporting (SDRs) using three different data mining algorithms (DMAs): the Gamma Poisson Shrinker (GPS), the urn-model algorithm (URN), and the proportional reporting rate (PRR) algorithm. Results were evaluated using a previously described Reference Event Database (RED) which contains documented drug–event associations for the 26 drugs. Analysis emphasized the percentage of \{SDRs\} in the “unlabeled supported” category, corresponding to those adverse events that were not described in the U.S. prescribing information for the drug at the time of its approval, but which were supported by some published evidence for an association with the drug. Results Based on a logistic regression analysis, the percentage of unlabeled supported \{SDRs\} was smallest at the \{PT\} level, intermediate at the \{HLT\} level, and largest at the \{SMQ\} level, for all three algorithms. The \{GPS\} and \{URN\} methods detected comparable percentages of unlabeled supported \{SDRs\} while the \{PRR\} method detected a smaller percentage, at all three MedDRA levels. No evidence of a method/level interaction was seen. Conclusions Use of \{HLT\} and \{SMQ\} groupings can improve the percentage of unlabeled supported \{SDRs\} in data mining results. The trade-off for this gain is the medically less-specific language of \{HLTs\} and \{SMQs\} compared to PTs, and the need for the added step in data mining of examining the component \{PTs\} of each \{HLT\} or \{SMQ\} that results in a signal of disproportionate reporting. }
}
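
Of the three algorithms compared above, the proportional reporting rate is the simplest: a ratio of event proportions from a 2x2 table of spontaneous reports. A sketch with invented counts (GPS and the urn model are more involved and are specified in the paper's references):

    # PRR from a 2x2 report table; all counts are invented.
    def prr(a, b, c, d):
        """a: reports with drug and event, b: drug and other events,
           c: other drugs and event, d: other drugs and other events."""
        return (a / (a + b)) / (c / (c + d))

    print(prr(a=20, b=380, c=100, d=9500))   # > 1 suggests disproportionate reporting
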
@article{Wang20091647,
  title = {Analyzing on the selecting behavior of mining cities’ industrial transition based on the viewpoint of sustainable development: a perspective of evolutionary game },
  journal = {Procedia Earth and Planetary Science },
  volume = {1},
  number = {1},
  pages = {1647 - 1653},
  year = {2009},
  note = {special issue title: Proceedings of the International Conference on Mining Science & Technology (ICMST2009) },
  issn = {1878-5220},
  doi = {http://dx.doi.org/10.1016/j.proeps.2009.09.253},
  url = {http://www.sciencedirect.com/science/article/pii/S1878522009002549},
  author = {Bang-jun Wang and Min Zhou and Feng Ji},
  keywords = {mining city},
  keywords = {sustainable development},
  keywords = {industry transition},
  keywords = {selective behaviour},
  keywords = {evolutionary game },
  abstract = {At present, mining cities face serious problems such as a singular structure of essential production factors, an abnormal industrial structure, short industrial value chains, low added value, a hermetic city industrial system, and a weakening ability to innovate. To pursue sustainable development, a mining city has to make a key decision in the process of industrial transition. Firstly, from the perspective of evolutionary game theory, the paper analyzes the state transition equation for the problem of whether a mining city needs to restructure its industry, together with the corresponding behavioral gaming under the different states of the space set; from this analysis, the decision-making threshold for industrial transition is estimated. Secondly, two different industrial transition types of mining cities are analyzed from a dynamic perspective, and the equilibrium of each game's evolutionary strategies is deduced. One type is a substituting structure system, in which the city withdraws entirely from the current resource-based industry and constitutes a whole new industrial structure. The other type upgrades the mining city's industrial structure through technological transformation and industrial innovation. Lastly, on the basis of the previous analyses, the paper suggests some focused measures for the mining city's future pattern of sustainable development. }
}
@article{Lian2014476,
  title = {Constructing protein-protein interaction network of hypertension with blood stasis syndrome via digital gene expression sequencing and database mining },
  journal = {Journal of Integrative Medicine },
  volume = {12},
  number = {6},
  pages = {476 - 482},
  year = {2014},
  note = {},
  issn = {2095-4964},
  doi = {http://dx.doi.org/10.1016/S2095-4964(14)60058-3},
  url = {http://www.sciencedirect.com/science/article/pii/S2095496414600583},
  author = {Yong-hong Lian and Mei-xia Fang and Li-guo Chen},
  keywords = {blood-stasis syndrome},
  keywords = {hypertension},
  keywords = {digital gene expression},
  keywords = {protein interaction mapping },
  abstract = {Objective To construct a protein-protein interaction (PPI) network in hypertension patients with blood-stasis syndrome (BSS) by using digital gene expression (DGE) sequencing and database mining techniques. Methods \{DGE\} analysis based on the Solexa Genome Analyzer platform was performed on vascular endothelial cells incubated with serum of hypertension patients with BSS. The differentially expressed genes were filtered by comparing the expression levels between the different experimental groups. Then functional categories and enriched pathways of the unique genes for \{BSS\} were analyzed using Database for Annotation, Visualization and Integrated Discovery (DAVID) to select those in the enrichment pathways. Interologous Interaction Database (I2D) was used to construct \{PPI\} networks with the selected genes for hypertension patients with BSS. The potential candidate genes related to \{BSS\} were identified by comparing the number of relationships among genes. Confirmed by quantitative reverse transcription-polymerase chain reaction (qRT-PCR), gene ontology (GO) analysis was used to infer the functional annotations of the potential candidate genes for BSS. Results With gene enrichment analysis using DAVID, a list of 58 genes was chosen from the unique genes. The selected 58 genes were analyzed using I2D, and a \{PPI\} network was constructed. Based on the network analysis results, candidate genes for \{BSS\} were identified: DDIT3, JUN, HSPA8, NFIL3, HSPA5, HIST2H2BE, H3F3B, CEBPB, \{SAT1\} and GADD45A. Verified through qRT-PCR and analyzed by GO, the functional annotations of the potential candidate genes were explored. Conclusion Compared with previous methodologies reported in the literature, the present \{DGE\} analysis and data mining method have shown a great improvement in analyzing BSS. }
}
@article{Baker2010510,
  title = {Mining connections between chemicals, proteins, and diseases extracted from Medline annotations },
  journal = {Journal of Biomedical Informatics },
  volume = {43},
  number = {4},
  pages = {510 - 519},
  year = {2010},
  note = {},
  issn = {1532-0464},
  doi = {http://dx.doi.org/10.1016/j.jbi.2010.03.008},
  url = {http://www.sciencedirect.com/science/article/pii/S1532046410000419},
  author = {Nancy C. Baker and Bradley M. Hemminger},
  keywords = {Literature-based discovery},
  keywords = {Drug discovery},
  keywords = {Text mining },
  abstract = {The biomedical literature is an important source of information about the biological activity and effects of chemicals. We present an application that extracts terms indicating biological activity of chemicals from Medline records, associates them with chemical name and stores the terms in a repository called ChemoText. We describe the construction of ChemoText and then demonstrate its utility in drug research by employing Swanson’s \{ABC\} discovery paradigm. We reproduce Swanson’s discovery of a connection between magnesium and migraine in a novel approach that uses only proteins as the intermediate B terms. We validate our methods by using a cutoff date and evaluate them by calculating precision and recall. In addition to magnesium, we have identified valproic acid and nitric oxide as chemicals which developed links to migraine. We hypothesize, based on protein annotations, that zinc and retinoic acid may play a role in migraine. The ChemoText repository has promise as a data source for drug discovery. }
}
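
Swanson's ABC paradigm, as restricted here, connects a chemical A to a disease C through shared intermediate protein terms B. A toy sketch; the term sets below are invented stand-ins for ChemoText annotations:

    # ABC discovery: bridge terms shared by a chemical and a disease.
    a_to_b = {"magnesium": {"NOS1", "CALM1", "GRIN1"}}   # chemical -> proteins (invented)
    c_to_b = {"migraine": {"NOS1", "GRIN1", "TNF"}}      # disease -> proteins (invented)

    def shared_b_terms(a, c):
        return a_to_b[a] & c_to_b[c]

    print(shared_b_terms("magnesium", "migraine"))       # candidate bridging proteins
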
@article{LI2009144,
  title = {Strongest Association Rules Mining for Personalized Recommendation },
  journal = {Systems Engineering - Theory & Practice },
  volume = {29},
  number = {8},
  pages = {144 - 152},
  year = {2009},
  note = {},
  issn = {1874-8651},
  doi = {http://dx.doi.org/10.1016/S1874-8651(10)60064-6},
  url = {http://www.sciencedirect.com/science/article/pii/S1874865110600646},
  author = {Jie LI and Yong XU and Yun-feng WANG and Chao-hsien CHU},
  keywords = {data mining},
  keywords = {association rules},
  keywords = {personalized recommendation},
  keywords = {strongest association rules },
  abstract = {This article proposes the notion of strongest association rules (SAR) and develops a matrix-based algorithm for mining the \{SAR\} set. As a subset of the whole association rule set, the \{SAR\} set includes far fewer rules, in a form especially suitable for personalized recommendation, without information loss. With the \{SAR\} set mining algorithm, the transaction database is scanned only once and the matrix scale becomes smaller and smaller, so the mining efficiency is improved. Experiments with three data sets show that the number of rules in the \{SAR\} set is on average only 26.2 percent of the total number of association rules, which mitigates the explosion of association rules. }
}
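
Support and confidence, the quantities any association-rule miner (including the SAR subset above) is built on, are easy to state; the matrix-based single-scan algorithm itself is specified in the paper. A toy sketch with invented transactions:

    # Support and confidence over a toy transaction database.
    transactions = [{"a", "b"}, {"a", "b", "c"}, {"a", "c"}, {"b", "c"}]

    def support(itemset):
        return sum(itemset <= t for t in transactions) / len(transactions)

    def confidence(lhs, rhs):
        return support(lhs | rhs) / support(lhs)

    print(support({"a", "b"}), confidence({"a"}, {"b"}))   # 0.5 and 2/3
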
@article{Liao200911045,
  title = {Ontology-based data mining approach implemented for sport marketing },
  journal = {Expert Systems with Applications },
  volume = {36},
  number = {8},
  pages = {11045 - 11056},
  year = {2009},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2009.02.087},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417409002383},
  author = {Shu-Hsien Liao and Jen-Lung Chen and Tze-Yuan Hsu},
  keywords = {Sport marketing},
  keywords = {Endorser},
  keywords = {Media},
  keywords = {Ontology},
  keywords = {Data mining},
  keywords = {Apriori algorithm},
  keywords = {Clustering analysis },
  abstract = {Since sport marketing is a commercial activity, precise customer and marketing segmentation must be investigated frequently; once a specific customer profile, segment, or pattern associated with marketing activities has been found, it helps in understanding the sport market. Such knowledge would not only help sport firms, but would also contribute to the broader field of sport customer behavior and marketing. This paper proposes using the Apriori algorithm of association rules, and clustering analysis based on an ontology-based data mining approach, for mining customer knowledge from the database. Knowledge extracted from data mining results is illustrated as knowledge patterns, rules, and maps in order to propose suggestions and solutions to the case firm, Taiwan Adidas, for possible product promotion and sport marketing. }
}
@article{Hong200847,
  title = {Linguistic object-oriented web-usage mining },
  journal = {International Journal of Approximate Reasoning },
  volume = {48},
  number = {1},
  pages = {47 - 61},
  year = {2008},
  note = {Special Section: Perception Based Data Mining and Decision Support Systems },
  issn = {0888-613X},
  doi = {http://dx.doi.org/10.1016/j.ijar.2007.06.006},
  url = {http://www.sciencedirect.com/science/article/pii/S0888613X07000771},
  author = {Tzung-Pei Hong and Cheng-Ming Huang and Shi-Jinn Horng},
  keywords = {Data mining},
  keywords = {Web mining},
  keywords = {Association rule},
  keywords = {Browsing pattern},
  keywords = {Object-oriented log data },
  abstract = {Web mining has become a very important research topic in the field of data mining due to the vast amount of World Wide Web services in recent years. The fuzzy and the object concepts have also been very popular and used in a variety of applications, especially for complex data description. This paper thus proposes a new fuzzy object-oriented web mining algorithm to derive fuzzy knowledge from object-oriented log data on web servers. Each web page itself is thought of as a class, and each web page browsed by a client is thought of as an instance. Instances with the same class (web page) may have different quantitative attribute values since they may appear in different clients. The proposed fuzzy mining algorithm can be divided into two main phases. The first phase is called the fuzzy intra-page mining phase, in which the linguistic large itemsets associated with the same classes (pages) but with different attributes are derived. Each linguistic large itemset found in this phase is then thought of as a composite item used in phase 2. The second phase is called the fuzzy inter-page mining phase, in which the large sequences are derived and used to represent the relationship among different web pages. Both the intra-page linguistic association rules and inter-page linguistic browsing patterns can thus be easily derived by the proposed algorithm at the same time. An example is given to illustrate the proposed algorithm. Experimental results also show the effects of the parameters used in the algorithm. }
}
@article{MartínVide2010261,
  title = {Incision due to gravel mining: Modeling a case study from the Gállego River, Spain },
  journal = {Geomorphology },
  volume = {117},
  number = {3–4},
  pages = {261 - 271},
  year = {2010},
  note = {Introduction to Management of Large Rivers },
  issn = {0169-555X},
  doi = {http://dx.doi.org/10.1016/j.geomorph.2009.01.019},
  url = {http://www.sciencedirect.com/science/article/pii/S0169555X09002438},
  author = {J.P. Martín-Vide and C. Ferrer-Boix and A. Ollero},
  keywords = {Bed degradation},
  keywords = {Gravel mining},
  keywords = {Bedload transport},
  keywords = {Sediment budget},
  keywords = {Mathematical modeling of river morphology},
  keywords = {Gállego River },
  abstract = {Historical information on river degradation was used in a case study on the Gállego River, a tributary of the Ebro River, in northeastern Spain. The Gállego drains about 4000 km2 of the southern slopes of the Pyrenees. Good channel surveys since the 1940s allowed a comparison of longitudinal bed profiles over time. Over this period about 1 million m3 of gravel was mined according to the archival records. The volume of alluvium lost due to incision in the same period was 2 million m3. This imbalance is tentatively explained by a budget model based on a bedload equation and an algorithm to determine whether the effective bedload transport is controlled by the transport capacity or by the supply of sediment. It appears that the incision process has changed the magnitude of the shear stresses on the channel bed. As the river became deeper, the channel could accommodate higher discharges without overbank flooding. The results obtained from a second model based on a diffusion equation for the bed elevation compared well with the field data. This model is based on the hypotheses of steady uniform flow regarding water and sediment conservation. }
}
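
The second model mentioned above treats bed elevation z(x, t) with a diffusion equation, dz/dt = K d2z/dx2. A sketch of an explicit finite-difference scheme for that generic form; the diffusivity, grid, and initial profile are invented, not the Gállego data:

    # Explicit diffusion of a toy longitudinal bed profile with a mined pit.
    import numpy as np

    K, dx, dt = 0.5, 100.0, 3600.0          # diffusivity, grid step, time step
    z = np.linspace(200.0, 180.0, 50)       # invented bed elevations (m)
    z[20:25] -= 2.0                         # local incision, e.g. a mined reach

    for _ in range(1000):                   # stable: K*dt/dx**2 = 0.18 <= 0.5
        z[1:-1] += K * dt / dx**2 * (z[2:] - 2 * z[1:-1] + z[:-2])

    print(z[18:27].round(2))                # the pit spreads up- and downstream
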
@article{Käkilehto2009e68,
  title = {Data mining of clinical oral health documents for analysis of the longevity of different restorative materials in Finland },
  journal = {International Journal of Medical Informatics },
  volume = {78},
  number = {12},
  pages = {e68 - e74},
  year = {2009},
  note = {Mining of Clinical and Biomedical Text and Data Special Issue },
  issn = {1386-5056},
  doi = {http://dx.doi.org/10.1016/j.ijmedinf.2009.04.004},
  url = {http://www.sciencedirect.com/science/article/pii/S1386505609000719},
  author = {Taina Käkilehto and Sinikka Salo and Markku Larmas},
  keywords = {Data mining},
  keywords = {Dental restorative materials},
  keywords = {Longevity},
  keywords = {Survival analysis },
  abstract = {Evidence-based dentistry has shown that different restorative materials have different survival times. Our primary hypothesis is that this should be revealed from normal dental records by the use of data mining techniques and a practice-based dentistry approach analysed in a scientifically sound way. Dental records from 1626 patients and altogether 19,892 restorations in three Finnish age cohorts were analysed. Survival curves (Kaplan–Meier) were drawn for each of the restorative materials. Median survival times for amalgam and resin-based composites were more than 15 years in older cohorts. More than 60% of silicate cement restorations were replaced within 5 years, and more than 50% of glass ionomers within 7 years. There was a significant reduction in the longevity of amalgams in the 1980 cohort. Data mining of digital oral health documents would be a useful tool to analyse survival curves of new restorative materials in a practice-based manner in real-life conditions. }
}
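
The survival curves above are Kaplan-Meier estimates. A bare-bones sketch with invented restoration lifetimes and censoring flags (not the Finnish cohort data):

    # Kaplan-Meier estimator; events are processed before censorings at tied times.
    def kaplan_meier(times, events):
        """times: years to replacement or censoring; events: 1 = replaced, 0 = censored."""
        pairs = sorted(zip(times, events), key=lambda p: (p[0], -p[1]))
        at_risk, s, curve = len(pairs), 1.0, []
        for t, e in pairs:
            if e:
                s *= (at_risk - 1) / at_risk
                curve.append((t, s))
            at_risk -= 1
        return curve

    print(kaplan_meier([2, 5, 5, 7, 12, 15], [1, 1, 0, 1, 0, 1]))
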
@article{Berendt20101,
  title = {Intelligent scientific authoring tools: Interactive data mining for constructive uses of citation networks },
  journal = {Information Processing & Management },
  volume = {46},
  number = {1},
  pages = {1 - 10},
  year = {2010},
  note = {},
  issn = {0306-4573},
  doi = {http://dx.doi.org/10.1016/j.ipm.2009.08.002},
  url = {http://www.sciencedirect.com/science/article/pii/S0306457309000880},
  author = {B. Berendt and B. Krause and S. Kolbe-Nusser},
  keywords = {[H.2.8] Database management – database applications – data mining},
  keywords = {[H.3.7] Information storage and retrieval – digital libraries – user issues},
  keywords = {[H.3.3] Information storage and retrieval – information search and retrieval – search process, information filtering},
  keywords = {[H.3.5] Information storage and retrieval – online information services – Web-based services},
  keywords = {[K.3.2] Computers and education – computer and information science education – literacy},
  keywords = {Citation analysis },
  abstract = {Many powerful methods and tools exist for extracting meaning from scientific publications, their texts, and their citation links. However, existing proposals often neglect a fundamental aspect of learning: that understanding and learning require an active and constructive exploration of a domain. In this paper, we describe a new method and a tool that use data mining and interactivity to turn the typical search and retrieve dialogue, in which the user asks questions and a system gives answers, into a dialogue that also involves sense-making, in which the user has to become active by constructing a bibliography and a domain model of the search term(s). This model starts from an automatically generated and annotated clustering solution that is iteratively modified by users. The tool is part of an integrated authoring system covering all phases from search through reading and sense-making to writing. Two evaluation studies demonstrate the usability of this interactive and constructive approach, and they show that clusters and groups represent identifiable sub-topics. }
}
@article{Zhang200910863,
  title = {Mining globally interesting patterns from multiple databases using kernel estimation },
  journal = {Expert Systems with Applications },
  volume = {36},
  number = {8},
  pages = {10863 - 10869},
  year = {2009},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2009.01.030},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417409000700},
  author = {Shichao Zhang and Xiaofang You and Zhi Jin and Xindong Wu},
  keywords = {Multiple database mining},
  keywords = {Global pattern},
  keywords = {Multiple data source discovery },
  abstract = {When extracting knowledge (or patterns) from multiple databases, the data from different databases might be too large in volume to be merged into one database for centralized mining on one computer, the local information sources might be hidden from a global decision maker due to privacy concerns, and different local databases may make different contributions to the global pattern. Dealing with multiple databases is essentially different from mining from a single database. In multi-database mining, the global patterns must be obtained by carefully analyzing the local patterns from individual databases. In this paper, we propose a nonlinear method, named \{KEMGP\} (kernel estimation for mining global patterns), which adopts kernel estimation to synthesize local patterns into global patterns. We also adopt a method to divide all the data in different databases according to attribute dimensionality, which reduces the total space complexity. We test our algorithm on a customer management system, where the application is to obtain all globally interesting patterns by analyzing the individual databases. The experimental results show that our method is efficient. }
}
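
One way to read "kernel estimation to synthesize local patterns" is to smooth a pattern's per-database supports with a kernel density estimate and take the mode, so a single outlying database does not dominate the global score. A sketch of that reading only; KEMGP's actual weighting and attribute partitioning follow the paper:

    # Kernel-smoothed synthesis of one pattern's local supports (values invented).
    import numpy as np
    from scipy.stats import gaussian_kde

    local_supports = np.array([0.12, 0.15, 0.14, 0.40, 0.13])  # 5 local databases
    kde = gaussian_kde(local_supports)
    grid = np.linspace(0.0, 1.0, 201)
    global_score = grid[np.argmax(kde(grid))]   # mode of the smoothed supports
    print(round(float(global_score), 3))        # near the 0.13-0.15 cluster
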
@article{Alatas2009455,
  title = {Multi-objective rule mining using a chaotic particle swarm optimization algorithm },
  journal = {Knowledge-Based Systems },
  volume = {22},
  number = {6},
  pages = {455 - 460},
  year = {2009},
  note = {},
  issn = {0950-7051},
  doi = {http://dx.doi.org/10.1016/j.knosys.2009.06.004},
  url = {http://www.sciencedirect.com/science/article/pii/S0950705109000902},
  author = {Bilal Alatas and Erhan Akin},
  keywords = {Data mining},
  keywords = {Multi-objective optimization},
  keywords = {Particle swarm optimization},
  keywords = {Chaotic maps },
  abstract = {In this paper, classification rule mining, one of the most studied tasks in the data mining community, has been modeled as a multi-objective optimization problem with predictive accuracy and comprehensibility objectives. A multi-objective chaotic particle swarm optimization (PSO) method has been introduced as a search strategy to mine classification rules within datasets. The extension to \{PSO\} used here employs a similarity measure for neighborhood and far-neighborhood search to store the global best particles found in a multi-objective manner. For the bi-objective problem of rule mining of high accuracy/comprehensibility, the multi-objective approach is intended to allow the \{PSO\} algorithm to return an approximation to the upper accuracy/comprehensibility border, containing solutions that are spread across the border. The experimental results show the efficiency of the algorithm. }
}
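
The "chaotic" ingredient in chaotic PSO typically replaces the uniform random draws of the velocity update with a chaotic map such as the logistic map. A one-dimensional sketch of that generic ingredient; the paper's multi-objective archive and similarity-based neighborhood search are not shown:

    # PSO velocity update driven by logistic-map sequences instead of rand().
    def logistic_map(x):
        return 4.0 * x * (1.0 - x)     # chaotic on (0, 1) for almost all seeds

    w, c1, c2 = 0.7, 1.5, 1.5          # inertia and acceleration coefficients
    x, v, pbest, gbest = 0.3, 0.0, 0.9, 0.8
    r1, r2 = 0.37, 0.61                # seeds of the chaotic sequences
    for _ in range(5):
        r1, r2 = logistic_map(r1), logistic_map(r2)
        v = w * v + c1 * r1 * (pbest - x) + c2 * r2 * (gbest - x)
        x += v
        print(round(x, 4))
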
@article{Choudhary2009728,
  title = {The needs and benefits of Text Mining applications on Post-Project Reviews },
  journal = {Computers in Industry },
  volume = {60},
  number = {9},
  pages = {728 - 740},
  year = {2009},
  note = {},
  issn = {0166-3615},
  doi = {http://dx.doi.org/10.1016/j.compind.2009.05.006},
  url = {http://www.sciencedirect.com/science/article/pii/S016636150900102X},
  author = {A.K. Choudhary and P.I. Oluikpe and J.A. Harding and P.M. Carrillo},
  keywords = {Text Mining},
  keywords = {Knowledge discovery},
  keywords = {Post-Project Reviews (PPRs)},
  keywords = {Manufacturing and construction },
  abstract = {Post-Project Reviews (PPRs) are a rich source of knowledge and data for organisations – if organisations have the time and resources to analyse them. Too often these reports are stored, unread by many who could benefit from them. \{PPR\} reports attempt to document the project experience – both good and bad. If these reports were analysed collectively, they may expose important detail, e.g. recurring problems or examples of good practice, perhaps repeated across a number of projects. However, because most companies do not have the resources to thoroughly examine \{PPR\} reports, either individually or collectively, important insights and opportunities to learn from previous projects, are missed. This research explores the application of knowledge discovery techniques and Text Mining to uncover patterns, associations, and trends from \{PPR\} reports. The results might then be used to address problem areas, enhance processes, and improve customer relationships. A case study related to two construction companies is presented in this paper and knowledge discovery techniques are used to analyse 50 \{PPR\} reports collected during the last three years. The case study has been examined in six contexts and the results show that Text Mining has a good potential to improve overall knowledge reuse and exploitation. }
}
@article{Roy2014151,
  title = {Special issue on data mining in pervasive environments },
  journal = {Pervasive and Mobile Computing },
  volume = {15},
  number = {0},
  pages = {151 - 152},
  year = {2014},
  note = {Special Issue on Information Management in Mobile Applications Special Issue on Data Mining in Pervasive Environments },
  issn = {1574-1192},
  doi = {http://dx.doi.org/10.1016/j.pmcj.2014.10.004},
  url = {http://www.sciencedirect.com/science/article/pii/S1574119214001679},
  author = {Nirmalya Roy and Parisa Rashidi and Larry Holder and Liming Chen}
}
@article{Magkos20091224,
  title = {Accurate and large-scale privacy-preserving data mining using the election paradigm },
  journal = {Data & Knowledge Engineering },
  volume = {68},
  number = {11},
  pages = {1224 - 1236},
  year = {2009},
  note = {Including Special Section: Conference on Privacy in Statistical Databases (PSD 2008) – Six selected and extended papers on Database Privacy },
  issn = {0169-023X},
  doi = {http://dx.doi.org/10.1016/j.datak.2009.06.003},
  url = {http://www.sciencedirect.com/science/article/pii/S0169023X09000950},
  author = {Emmanouil Magkos and Manolis Maragoudakis and Vassilis Chrissikopoulos and Stefanos Gritzalis},
  keywords = {Security and privacy},
  keywords = {Distributed data mining},
  keywords = {Homomorphic encryption},
  keywords = {Random Forests classification },
  abstract = {With the proliferation of the Web and \{ICT\} technologies there have been concerns about the handling and use of sensitive information by data mining systems. Recent research has focused on distributed environments where the participants in the system may also be mutually mistrustful. In this paper we discuss the design and security requirements for large-scale privacy-preserving data mining (PPDM) systems in a fully distributed setting, where each client possesses its own records of private data. To this end we argue in favor of using some well-known cryptographic primitives, borrowed from the literature on Internet elections. More specifically, our framework is based on the classical homomorphic election model, and particularly on an extension for supporting multi-candidate elections. We also review a recent scheme [Z. Yang, S. Zhong, R.N. Wright, Privacy-preserving classification of customer data without loss of accuracy, in: SDM’ 2005 \{SIAM\} International Conference on Data Mining, 2005] which was the first scheme that used the homomorphic encryption primitive for \{PPDM\} in the fully distributed setting. Finally, we show how our approach can be used as a building block to obtain Random Forests classification with enhanced prediction performance. }
}
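
The homomorphic election model referenced above rests on additively homomorphic encryption: multiplying ciphertexts adds the underlying plaintexts, so a tally can be computed without decrypting individual contributions. A toy Paillier sketch (tiny, insecure parameters, Python 3.9+; a real PPDM deployment would use a vetted cryptographic library):

    # Toy Paillier: homomorphic tally of private bits. Insecure demo parameters.
    import math, random

    p, q = 293, 433                     # toy primes, far too small for real use
    n, n2 = p * q, (p * q) ** 2
    lam = math.lcm(p - 1, q - 1)
    g = n + 1

    def encrypt(m):
        r = random.randrange(1, n)
        while math.gcd(r, n) != 1:
            r = random.randrange(1, n)
        return (pow(g, m, n2) * pow(r, n, n2)) % n2

    def decrypt(c):
        l_u = (pow(c, lam, n2) - 1) // n
        mu = pow((pow(g, lam, n2) - 1) // n, -1, n)
        return (l_u * mu) % n

    votes = [1, 0, 1, 1, 0]             # each client encrypts its own bit
    tally = 1
    for vote in votes:
        tally = (tally * encrypt(vote)) % n2   # ciphertext product = plaintext sum
    print(decrypt(tally))               # 3, without exposing individual votes
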
@article{Delen20081707,
  title = {Seeding the survey and analysis of research literature with text mining },
  journal = {Expert Systems with Applications },
  volume = {34},
  number = {3},
  pages = {1707 - 1720},
  year = {2008},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2007.01.035},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417407000486},
  author = {Dursun Delen and Martin D. Crossland},
  keywords = {Text mining},
  keywords = {Data mining},
  keywords = {Literature survey},
  keywords = {Information extraction},
  keywords = {Categorization},
  keywords = {Clustering},
  keywords = {Classification },
  abstract = {Text mining is a semi-automated process of extracting knowledge from a large amount of unstructured data. Given that the amount of unstructured data being generated and stored is increasing rapidly, the need for automated means to process it is also increasing. In this study, we present, discuss and evaluate the techniques used to perform text mining on collections of textual information. A case study is presented using text mining to identify clusters and trends of related research topics from three major journals in the management information systems field. Based on the findings of this case study, it is proposed that this type of analysis could potentially be valuable for researchers in any field. }
}
@article{Oztekin2009e84,
  title = {Predicting the graft survival for heart–lung transplantation patients: An integrated data mining methodology },
  journal = {International Journal of Medical Informatics },
  volume = {78},
  number = {12},
  pages = {e84 - e96},
  year = {2009},
  note = {Mining of Clinical and Biomedical Text and Data Special Issue },
  issn = {1386-5056},
  doi = {http://dx.doi.org/10.1016/j.ijmedinf.2009.04.007},
  url = {http://www.sciencedirect.com/science/article/pii/S1386505609000707},
  author = {Asil Oztekin and Dursun Delen and Zhenyu (James) Kong},
  keywords = {Survival analysis},
  keywords = {Combined heart–lung transplantation},
  keywords = {Classification},
  keywords = {Data mining},
  keywords = {Cox proportional hazards models },
  abstract = {Background Predicting the survival of heart–lung transplant patients has the potential to play a critical role in understanding and improving the matching procedure between the recipient and graft. Although voluminous data related to the transplantation procedures is being collected and stored, only a small subset of the predictive factors has been used in modeling heart–lung transplantation outcomes. The previous studies have mainly focused on applying statistical techniques to a small set of factors selected by the domain-experts in order to reveal the simple linear relationships between the factors and survival. The collection of methods known as ‘data mining’ offers significant advantages over conventional statistical techniques in dealing with the latter's limitations such as normality assumption of observations, independence of observations from each other, and linearity of the relationship between the observations and the output measure(s). There are statistical methods that overcome these limitations. Yet, they are computationally more expensive and do not provide fast and flexible solutions as do data mining techniques in large datasets. Purpose The main objective of this study is to improve the prediction of outcomes following combined heart–lung transplantation by proposing an integrated data-mining methodology. Methods A large and feature-rich dataset (16,604 cases with 283 variables) is used to (1) develop machine learning based predictive models and (2) extract the most important predictive factors. Then, using three different variable selection methods, namely, (i) machine learning methods driven variables—using decision trees, neural networks, logistic regression, (ii) the literature review-based expert-defined variables, and (iii) common sense-based interaction variables, a consolidated set of factors is generated and used to develop Cox regression models for heart–lung graft survival. Results The predictive models’ performance in terms of 10-fold cross-validation accuracy rates for two multi-imputed datasets ranged from 79% to 86% for neural networks, from 78% to 86% for logistic regression, and from 71% to 79% for decision trees. The results indicate that the proposed integrated data mining methodology using Cox hazard models better predicted the graft survival with different variables than the conventional approaches commonly used in the literature. This result is validated by the comparison of the corresponding Gains charts for our proposed methodology and the literature review based Cox results, and by the comparison of Akaike information criteria (AIC) values received from each. Conclusions Data mining-based methodology proposed in this study reveals that there are undiscovered relationships (i.e. interactions of the existing variables) among the survival-related variables, which helps better predict the survival of the heart–lung transplants. It also brings a different set of variables into the scene to be evaluated by the domain-experts and be considered prior to the organ transplantation. }
}
@article{Small20141450,
  title = {Identifying emerging topics in science and technology },
  journal = {Research Policy },
  volume = {43},
  number = {8},
  pages = {1450 - 1467},
  year = {2014},
  note = {},
  issn = {0048-7333},
  doi = {http://dx.doi.org/10.1016/j.respol.2014.02.005},
  url = {http://www.sciencedirect.com/science/article/pii/S0048733314000298},
  author = {Henry Small and Kevin W. Boyack and Richard Klavans},
  keywords = {Citation-based modeling},
  keywords = {Emerging topics},
  keywords = {Scientific discovery},
  keywords = {Technological innovation},
  keywords = {Exogenous events },
  abstract = {Abstract The identification of emerging topics is of current interest to decision makers in both government and industry. Although many case studies present retrospective analyses of emerging topics, few studies actually nominate emerging topics for consideration by decision makers. We present a novel approach to identifying emerging topics in science and technology. Two large scale models of the scientific literature, one based on direct citation, and the other based on co-citation, are combined to nominate emerging topics using a difference function that rewards clusters that are new and growing rapidly. The top 25 emergent topics are identified for each year 2007 through 2010. These topics are classified and characterized in various ways in order to understand the motive forces behind their emergence, whether scientific discovery, technological innovation, or exogenous events. Topics are evaluated by searching for recent major awards associated with the topic or its key researchers. The evidence presented suggests that the methodology nominates a viable list of emerging topics suitable for inspection by decision makers. }
}
@article{Nie20094589,
  title = {Decision analysis of data mining project based on Bayesian risk },
  journal = {Expert Systems with Applications },
  volume = {36},
  number = {3, Part 1},
  pages = {4589 - 4594},
  year = {2009},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2008.05.014},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417408002170},
  author = {Guangli Nie and Lingling Zhang and Ying Liu and Xiuyu Zheng and Yong Shi},
  keywords = {Decision analysis},
  keywords = {Data mining},
  keywords = {Business intelligence },
  abstract = {Data mining, an efficient method of business intelligence, is a process to extract knowledge from large-scale data. As enterprises and their data grow in size, data mining becomes more and more necessary as a way to make use of the data. However, most of the literature focuses only on the algorithms themselves; few studies examine, from the perspective of company managers, what conditions should be fulfilled before deciding to undertake data mining. This paper discusses the factors that affect a data mining project. Based on Bayesian risk, we build a model that takes the risk attitude of the top executive into account to help managers decide whether or not to do data mining. }
}
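
The decision model above compares expected losses. A bare Bayes-risk sketch with hypothetical probabilities and losses; the paper additionally folds the top executive's risk attitude into the model:

    # Pick the action with the lower Bayes risk (expected loss). Numbers invented.
    p_success = 0.6                     # assumed P(data mining project pays off)
    loss = {("mine", "success"): -100,  # net gain written as negative loss
            ("mine", "failure"): 40,
            ("skip", "success"): 0,
            ("skip", "failure"): 0}

    def bayes_risk(action):
        return (p_success * loss[(action, "success")]
                + (1 - p_success) * loss[(action, "failure")])

    best = min(["mine", "skip"], key=bayes_risk)
    print(best, bayes_risk("mine"), bayes_risk("skip"))   # mine -44.0 0.0
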
@article{Rill201424,
  title = {PoliTwi: Early detection of emerging political topics on twitter and the impact on concept-level sentiment analysis },
  journal = {Knowledge-Based Systems },
  volume = {69},
  number = {0},
  pages = {24 - 33},
  year = {2014},
  note = {},
  issn = {0950-7051},
  doi = {http://dx.doi.org/10.1016/j.knosys.2014.05.008},
  url = {http://www.sciencedirect.com/science/article/pii/S0950705114001920},
  author = {Sven Rill and Dirk Reinel and Jörg Scheidt and Roberto V. Zicari},
  keywords = {Topic detection},
  keywords = {Concept-level sentiment analysis},
  keywords = {Big data},
  keywords = {Twitter},
  keywords = {Social data analysis },
  abstract = {Abstract In this work, we present a system called PoliTwi, which was designed to detect emerging political topics (Top Topics) in Twitter sooner than other standard information channels. The recognized Top Topics are shared via different channels with the wider public. For the analysis, we have collected about 4,000,000 tweets before and during the parliamentary election 2013 in Germany, from April until September 2013. It is shown that new topics appearing in Twitter can be detected right after their occurrence. Moreover, we have compared our results to Google Trends. We observed that the topics emerged earlier in Twitter than in Google Trends. Finally, we show how these topics can be used to extend existing knowledge bases (web ontologies or semantic networks) which are required for concept-level sentiment analysis. For this, we utilized special Twitter hashtags, called sentiment hashtags, used by the German community during the parliamentary election. }
}
@article{García20102044,
  title = {Advanced nonparametric tests for multiple comparisons in the design of experiments in computational intelligence and data mining: Experimental analysis of power },
  journal = {Information Sciences },
  volume = {180},
  number = {10},
  pages = {2044 - 2064},
  year = {2010},
  note = {Special Issue on Intelligent Distributed Information Systems },
  issn = {0020-0255},
  doi = {http://dx.doi.org/10.1016/j.ins.2009.12.010},
  url = {http://www.sciencedirect.com/science/article/pii/S0020025509005404},
  author = {Salvador García and Alberto Fernández and Julián Luengo and Francisco Herrera},
  keywords = {Statistical analysis},
  keywords = {Computational intelligence},
  keywords = {Data mining},
  keywords = {Nonparametric statistics},
  keywords = {Multiple comparisons procedures},
  keywords = {Genetics-based machine learning},
  keywords = {Fuzzy classification systems },
  abstract = {Experimental analysis of the performance of a proposed method is a crucial and necessary task in an investigation. In this paper, we focus on the use of nonparametric statistical inference for analyzing the results obtained in an experiment design in the field of computational intelligence. We present a case study which involves a set of techniques in classification tasks and we study a set of nonparametric procedures useful to analyze the behavior of a method with respect to a set of algorithms, such as the framework in which a new proposal is developed. Particularly, we discuss some basic and advanced nonparametric approaches which improve the results offered by the Friedman test in some circumstances. A set of post hoc procedures for multiple comparisons is presented together with the computation of adjusted p-values. We also perform an experimental analysis for comparing their power, with the objective of detecting the advantages and disadvantages of the statistical tests described. We found that some aspects such as the number of algorithms, number of data sets and differences in performance offered by the control method are very influential in the statistical tests studied. Our final goal is to offer a complete guideline for the use of nonparametric statistical procedures for performing multiple comparisons in experimental studies. }
}
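
The starting point of the procedures studied above is the Friedman test over an algorithms-by-datasets table of results. A sketch using scipy with hypothetical accuracy scores; the advanced post hoc comparisons and adjusted p-values are the subject of the paper:

    # Friedman test: do three algorithms rank differently across five data sets?
    from scipy.stats import friedmanchisquare

    alg_a = [0.81, 0.78, 0.90, 0.66, 0.74]   # invented accuracies per data set
    alg_b = [0.79, 0.75, 0.88, 0.64, 0.71]
    alg_c = [0.85, 0.80, 0.92, 0.70, 0.77]

    stat, pvalue = friedmanchisquare(alg_a, alg_b, alg_c)
    print(stat, pvalue)   # a small p-value motivates post hoc pairwise tests
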
@article{Lin200911543,
  title = {Applying enhanced data mining approaches in predicting bank performance: A case of Taiwanese commercial banks },
  journal = {Expert Systems with Applications },
  volume = {36},
  number = {9},
  pages = {11543 - 11551},
  year = {2009},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2009.03.029},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417409002863},
  author = {Shih-Wei Lin and Yeou-Ren Shiue and Shih-Chi Chen and Hui-Miao Cheng},
  keywords = {Bank performance},
  keywords = {Data mining},
  keywords = {Particle swarm optimization},
  keywords = {Parameter optimization},
  keywords = {Feature selection },
  abstract = {The prediction of bank performance is an important issue. The bad performance of banks may first result in bankruptcy, which is expected to influence the economy of the country eventually. Since the early 1970s, many researchers have made predictions on such issues. However, until recent years, most of them have used traditional statistics to build the prediction model. Because of the vigorous development of data mining techniques, many researchers have begun to apply those techniques to various fields, including performance prediction systems. However, data mining techniques have the problem of parameter settings. Therefore, this study applies particle swarm optimization (PSO) to obtain suitable parameter settings for the support vector machine (SVM) and decision tree (DT), and to select a subset of beneficial features, without reducing the classification accuracy rate. In order to evaluate the proposed approaches, a dataset collected from Taiwanese commercial banks is used as source data. The experimental results showed that the proposed approaches could obtain a better parameter setting, reduce unnecessary features, and improve the accuracy of classification significantly. }
}
@article{Hong20099747,
  title = {An effective mining approach for up-to-date patterns },
  journal = {Expert Systems with Applications },
  volume = {36},
  number = {6},
  pages = {9747 - 9752},
  year = {2009},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2009.02.029},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417409001675},
  author = {Tzung-Pei Hong and Yi-Ying Wu and Shyue-Liang Wang},
  keywords = {Data mining},
  keywords = {Temporal patterns},
  keywords = {Up-to-date patterns},
  keywords = {Lifetime },
  abstract = {Mining association rules is most commonly seen among the techniques for knowledge discovery from databases (KDD). It is used to discover relationships among items or itemsets. Furthermore, temporal data mining is concerned with the analysis of temporal data and the discovery of temporal patterns and regularities. In this paper, a new concept of up-to-date patterns is proposed, which is a hybrid of association rules and temporal mining. An itemset may not be frequent (large) for an entire database but may be large up-to-date, since items that seldom occur early may occur frequently later on. An up-to-date pattern is thus composed of an itemset and its up-to-date lifetime, in which the user-defined minimum-support threshold must be satisfied. The proposed approach can mine more useful large itemsets than the conventional ones, which discover large itemsets valid only for the entire database. Experimental results show that the proposed algorithm is more effective than the traditional ones in discovering such up-to-date temporal patterns, especially when the minimum-support threshold is high. }
}
@article{TavakoliKashani201493,
  title = {A data mining approach to investigate the factors influencing the crash severity of motorcycle pillion passengers },
  journal = {Journal of Safety Research },
  volume = {51},
  number = {0},
  pages = {93 - 98},
  year = {2014},
  note = {},
  issn = {0022-4375},
  doi = {http://dx.doi.org/10.1016/j.jsr.2014.09.004},
  url = {http://www.sciencedirect.com/science/article/pii/S0022437514000929},
  author = {Ali Tavakoli Kashani and Rahim Rabieyan and Mohammad Mehdi Besharati},
  keywords = {Motorcycle pillion passengers},
  keywords = {Crash severity},
  keywords = {Classification and regression trees },
  abstract = {Introduction: Motorcycle passengers comprise a considerable proportion of traffic crash victims. During a 5 year period (2006–2010) in Iran, an average of 3.4 pillion passengers were killed daily due to motorcycle crashes. This study investigated the main factors influencing crash severity of this group of road users. Method: The Classification and Regression Trees (CART) method was employed to analyze the injury severity of pillion passengers in Iran over a 4 year period (2009–2012). Results: The predictive accuracy of the model built with a total of 16 variables was 74%, which showed a considerable improvement compared to previous studies. The results indicate that area type, land use, and injured part of the body (head, neck, etc.) are the most influential factors affecting the fatality of motorcycle passengers. Results also show that helmet usage could reduce the fatality risk among motorcycle passengers by 28%. Practical Applications: The findings of this study might help develop more targeted countermeasures to reduce the death rate of motorcycle pillion passengers. }
}
@article{Yeh20147730,
  title = {Exploring the dynamic model of the returns from value stocks and growth stocks using time series mining },
  journal = {Expert Systems with Applications },
  volume = {41},
  number = {17},
  pages = {7730 - 7743},
  year = {2014},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2014.06.036},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417414003777},
  author = {I-Cheng Yeh and Tzu-Kuang Hsu},
  keywords = {Growth stocks},
  keywords = {Value stocks},
  keywords = {Return rate},
  keywords = {Exponential decay model },
  abstract = {This study considered that value stocks and growth stocks are 2-dimensional concepts. We defined the book-to-market ratio as the value factor and the return on equity as the growth factor. We used these 2 factors to divide stocks into 4 types: high-value, low-value, high-growth, and low-growth stocks. Furthermore, we explored the change in stock prices and stock returns for these 4 categories before and after the formation of investment portfolios. We also established a dynamic model of the returns from value stocks and growth stocks, called the exponential decay model. Finally, we used Taiwan Stock Exchange data to examine the effectiveness of the model during the period from 1995 to 2009. The results are as follows: first, high-value stocks and low-value stocks exhibit a significantly over-reacting phenomenon. Second, high-growth stocks and low-growth stocks exhibit an obviously under-reacting phenomenon. Third, in each current quarter, high-value stocks exhibit the lowest returns; however, in the subsequent quarter, they have the highest returns, and then demonstrate a slowly declining trend in the following quarters. These results showed that the stock market can react dramatically to extraordinary information and that it requires considerable time to correct itself from an excessive reaction; thus high-value stocks exhibited a higher return. Fourth, in each current quarter, high-growth stocks had the highest return, followed by a rapidly decreasing trend in the following quarters. The t + 3 quarter returns were lower than those of low-growth stocks. This result demonstrated that the stock market does not exhibit an adequate reaction, but still remains rather efficient for routine financial information. Finally, regardless of value stocks or growth stocks, exponential decay models could accurately match the data. }
}
@article{Jeong200911204,
  title = {Refining search results using a mining framework },
  journal = {Expert Systems with Applications },
  volume = {36},
  number = {8},
  pages = {11204 - 11210},
  year = {2009},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2009.02.074},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417409002322},
  author = {Ok-Ran Jeong and Eunseok Lee and Won Kim},
  keywords = {Refining search results},
  keywords = {Keyword-based search},
  keywords = {Mining framework},
  keywords = {Top terms },
  abstract = {Today’s major search engines return ranked search results that match the keywords the user specifies. There have been many proposals to rank the search results such that they match the user’s intentions and needs more closely. Despite good advances during the past decade, this problem still requires considerable research, as the number of search results has become ever larger. We define the collection of each search result and all the Web pages that are linked to the result as a search-result drilldown. We hypothesize that by mining and analyzing the top terms in the search-result drilldown of search results, it may be possible to make each search result more meaningful to the user, so that the user may select the desired search results with higher confidence. In this paper, we describe this technique, and show the results of preliminary validation work that we have done. }
}
@article{Seffernick200917,
  title = {Investigative mining of sequence data for novel enzymes: A case study with nitrilases },
  journal = {Journal of Biotechnology },
  volume = {143},
  number = {1},
  pages = {17 - 26},
  year = {2009},
  note = {},
  issn = {0168-1656},
  doi = {http://dx.doi.org/10.1016/j.jbiotec.2009.06.004},
  url = {http://www.sciencedirect.com/science/article/pii/S0168165609002636},
  author = {Jennifer L. Seffernick and Sudip K. Samanta and Tai Man Louie and Lawrence P. Wackett and Mani Subramanian},
  keywords = {Nitrilase},
  keywords = {Genome mining},
  keywords = {Mandelonitrile },
  abstract = {Mining sequence data is increasingly important for biocatalysis research. However, when relying on sequence data alone, prediction of the reaction catalyzed by a specific protein sequence is often elusive, and substrate specificity is far from trivial. The present study demonstrated an approach combining sequence data and structures from distant homologs to target the identification of new nitrilases that specifically utilize hindered nitrile substrates like mandelonitrile. A total of 212 non-identical target nitrilases were identified from GenBank. Evolutionary trace and sequence clustering methods were used in combination to identify a set of nitrilases with presumably distinct substrate specificities. Selected encoding genes were cloned into Escherichia coli. Recombinant E. coli expressing NitA (gi91784632) from Burkholderia xenovorans \{LB400\} was capable of growth on glutaronitrile or adiponitrile as the sole nitrogen source. Purified NitA exhibited the highest activity with mandelonitrile, showing a catalytic efficiency (kcat/Km) of 3.6 × 10^4 M^-1 s^-1. A second nitrilase predicted by our studies, from Bradyrhizobium japonicum \{USDA\} 110 (gi27381513), was likewise shown to prefer mandelonitrile [Zhu, D., Mukherjee, C., Biehl, E.R., Hua, L., 2007. Discovery of a mandelonitrile hydrolase from Bradyrhizobium japonicum \{USDA110\} by rational genome mining. J. Biotechnol. 129 (4), 645–650]. Thus, predictions from sequence analysis and distant superfamily structures yielded enzyme activities with high selectivity for mandelonitrile. These data suggest that similar data mining techniques can be used to identify other substrate-specific enzymes from published, unannotated sequences. }
}
@article{Kavurucu200911418,
  title = {ILP-based concept discovery in multi-relational data mining },
  journal = {Expert Systems with Applications },
  volume = {36},
  number = {9},
  pages = {11418 - 11428},
  year = {2009},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2009.02.100},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417409002000},
  author = {Y. Kavurucu and P. Senkul and I.H. Toroslu},
  keywords = {\{ILP\}},
  keywords = {Data mining},
  keywords = {\{MRDM\}},
  keywords = {Concept discovery},
  keywords = {Aggregate predicate },
  abstract = {Multi-relational data mining has become popular due to the limitations of propositional problem definition in structured domains and the tendency of storing data in relational databases. Several relational knowledge discovery systems have been developed employing various search strategies, heuristics, language pattern limitations and hypothesis evaluation criteria, in order to cope with the intractably large search space and to be able to generate high-quality patterns. In this work, an ILP-based concept discovery method, namely Confidence-based Concept Discovery (C2D), is described in which strong declarative biases and user-defined specifications are relaxed. Moreover, this new method works directly on relational databases. In addition, a new confidence-based pruning method is used in this technique. We also describe how to define and use aggregate predicates as background knowledge in the proposed method. In order to use aggregate predicates, we show how to handle numerical attributes by using comparison operators on them. Finally, we analyze the effect of incorporating unrelated facts for generating transitive rules on the proposed method. A set of experiments is conducted on real-world problems to test the performance of the proposed method. }
}
@article{Liu20092902,
  title = {A self-organizing feature maps and data mining based decision support system for liability authentications of traffic crashes },
  journal = {Neurocomputing },
  volume = {72},
  number = {13–15},
  pages = {2902 - 2908},
  year = {2009},
  note = {Hybrid Learning Machines (HAIS 2007) / Recent Developments in Natural Computation (ICNC 2007) },
  issn = {0925-2312},
  doi = {http://dx.doi.org/10.1016/j.neucom.2008.06.032},
  url = {http://www.sciencedirect.com/science/article/pii/S0925231209001003},
  author = {Pei Liu},
  keywords = {Self-organizing feature maps},
  keywords = {Data mining},
  keywords = {\{CART\}},
  keywords = {Decision support system},
  keywords = {Traffic crashes},
  keywords = {Liability authentication },
  abstract = {This study develops a decision support tool for liability authentications of two-vehicle crashes based on generated self-organizing feature maps (SOM) and data mining (DM) models. Factors critical to liability attribution, commonly identified both theoretically and practically, were first selected. Both \{SOM\} and \{DM\} models were then generated for frontal, side, and rear collisions of two-vehicle crashes. The appropriateness of all generated models was evaluated and confirmed. Finally, a decision support tool was developed using active server pages. Despite the small data size, the decision support system was considered capable of giving reasonably good liability attributions and references for given cases. }
}
@article{Jiang20091034,
  title = {Mining globally distributed frequent subgraphs in a single labeled graph },
  journal = {Data & Knowledge Engineering },
  volume = {68},
  number = {10},
  pages = {1034 - 1058},
  year = {2009},
  note = {},
  issn = {0169-023X},
  doi = {http://dx.doi.org/10.1016/j.datak.2009.04.008},
  url = {http://www.sciencedirect.com/science/article/pii/S0169023X09000585},
  author = {Xing Jiang and Hui Xiong and Chen Wang and Ah-Hwee Tan},
  keywords = {Frequent subgraph mining},
  keywords = {G-Measure},
  keywords = {G-Pattern },
  abstract = {Recent years have seen increasing efforts on graph mining, and many algorithms have been developed for this purpose. However, most of the existing algorithms are designed for discovering frequent subgraphs in a set of labeled graphs only. Also, the few algorithms that find frequent subgraphs in a single labeled graph typically identify subgraphs appearing regionally in the input graph. In contrast, for real-world applications, it is commonly required that the identified frequent subgraphs in a single labeled graph should also be globally distributed. This paper thus fills this crucial void by proposing a new measure, termed G-Measure, to find globally distributed frequent subgraphs, called G-Patterns, in a single labeled graph. Specifically, we first show that the G-Patterns, selected by G-Measure, tend to be globally distributed in the input graph. Then, we show that G-Measure has the downward closure property, which guarantees that the G-Measure value of a G-Pattern is not less than those of its supersets. Consequently, a G-Miner algorithm is developed for finding G-Patterns. Experimental results on four synthetic and seven real-world data sets, and comparison with the existing algorithms, demonstrate the efficacy of the G-Measure and the G-Miner for finding G-Patterns. Finally, an application of the G-Patterns is given. }
}
@article{Chen200946,
  title = {Mining fuzzy association rules from questionnaire data },
  journal = {Knowledge-Based Systems },
  volume = {22},
  number = {1},
  pages = {46 - 56},
  year = {2009},
  note = {},
  issn = {0950-7051},
  doi = {http://dx.doi.org/10.1016/j.knosys.2008.06.003},
  url = {http://www.sciencedirect.com/science/article/pii/S0950705108001305},
  author = {Yen-Liang Chen and Cheng-Hsiung Weng},
  keywords = {Data mining},
  keywords = {Fuzzy sets},
  keywords = {Association rules},
  keywords = {Questionnaire data },
  abstract = {Association rule mining is one of the most popular data analysis methods that can discover associations within data. Association rule mining algorithms have been applied to various datasets, due to their practical usefulness. Little attention has been paid, however, to how to apply association mining techniques to analyze questionnaire data. Therefore, this paper first identifies the various data types that may appear in a questionnaire. Then, we introduce the questionnaire data mining problem and define the rule patterns that can be mined from questionnaire data. A unified approach is developed based on fuzzy techniques so that all different data types can be handled in a uniform manner. After that, an algorithm is developed to discover fuzzy association rules from the questionnaire dataset. Finally, we evaluate the performance of the proposed algorithm, and the results indicate that our method is capable of finding interesting association rules that would never have been found by previous mining algorithms. }
}
@article{Miao20097192,
  title = {AMAZING: A sentiment mining and retrieval system },
  journal = {Expert Systems with Applications },
  volume = {36},
  number = {3, Part 2},
  pages = {7192 - 7198},
  year = {2009},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2008.09.035},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417408006684},
  author = {Qingliang Miao and Qiudan Li and Ruwei Dai},
  keywords = {Sentiment retrieval},
  keywords = {Sentiment mining},
  keywords = {Temporal opinion quality},
  keywords = {Visualization},
  keywords = {Rank },
  abstract = {With the rapid growth of e-commerce, there are a great number of customer reviews on e-commerce websites. Potential customers usually wade through many on-line reviews in order to make an informed decision. However, retrieving sentiment information relevant to a customer’s interest still remains challenging. Developing a sentiment mining and retrieval system is a good way to overcome the problem of overloaded information in customer reviews. In this paper, we propose a sentiment mining and retrieval system which mines useful knowledge from consumer product reviews by utilizing data mining and information retrieval technology. A novel ranking mechanism taking temporal opinion quality (TOQ) and relevance into account is developed to meet customers’ information needs. In addition, the trend movement of customer reviews and the comparison between positive and negative evaluations are presented visually in the system. Experimental results on a real-world data set show the system is feasible and effective. }
}
@article{Lourenço2009710,
  title = {@Note: A workbench for Biomedical Text Mining },
  journal = {Journal of Biomedical Informatics },
  volume = {42},
  number = {4},
  pages = {710 - 720},
  year = {2009},
  note = {},
  issn = {1532-0464},
  doi = {http://dx.doi.org/10.1016/j.jbi.2009.04.002},
  url = {http://www.sciencedirect.com/science/article/pii/S1532046409000537},
  author = {Anália Lourenço and Rafael Carreira and Sónia Carneiro and Paulo Maia and Daniel Glez-Peña and Florentino Fdez-Riverola and Eugénio C. Ferreira and Isabel Rocha and Miguel Rocha},
  keywords = {Biomedical Text Mining},
  keywords = {Named Entity Recognition},
  keywords = {Information Retrieval},
  keywords = {Information Extraction},
  keywords = {Literature curation},
  keywords = {Semantic annotation},
  keywords = {Component-based software development },
  abstract = {Biomedical Text Mining (BioTM) is providing valuable approaches to the automated curation of scientific literature. However, most efforts have addressed the benchmarking of new algorithms rather than user operational needs. Bridging the gap between BioTM researchers and biologists’ needs is crucial to solve real-world problems and promote further research. We present @Note, a platform for BioTM that aims at the effective translation of advances among three distinct classes of users: biologists, text miners and software developers. Its main functional contributions are the ability to process abstracts and full-texts; an information retrieval module enabling PubMed search and journal crawling; a pre-processing module with PDF-to-text conversion, tokenisation and stopword removal; a semantic annotation schema; a lexicon-based annotator; a user-friendly annotation view that allows users to correct annotations; and a Text Mining Module supporting dataset preparation and algorithm evaluation. @Note improves interoperability, modularity and flexibility when integrating in-house and open-source third-party components. Its component-based architecture allows the rapid development of new applications, emphasizing the principles of transparency and simplicity of use. Although the platform is still under development, it has already allowed the development of applications that are currently in use. }
}
@article{Chiu2014412,
  title = {Topic knowledge map and knowledge structure constructions with genetic algorithm, information retrieval, and multi-dimension scaling method },
  journal = {Knowledge-Based Systems },
  volume = {67},
  number = {0},
  pages = {412 - 428},
  year = {2014},
  note = {},
  issn = {0950-7051},
  doi = {http://dx.doi.org/10.1016/j.knosys.2014.03.008},
  url = {http://www.sciencedirect.com/science/article/pii/S0950705114000914},
  author = {Deng-Yiv Chiu and Ya-Chen Pan},
  keywords = {Knowledge structure},
  keywords = {Topic knowledge map},
  keywords = {Information retrieval},
  keywords = {Genetic algorithm},
  keywords = {Independent chi-square},
  keywords = {Multi-dimension scaling },
  abstract = {This work presents a novel automated approach to construct topic knowledge maps with knowledge structures, followed by its application to an internationally renowned journal. Knowledge structures are diagrams showing the important components of the knowledge under study. Knowledge maps identify the locations of objects and illustrate the relationships among objects. In our study, the important components derived from knowledge structures are used as objects to be spotted in a topic knowledge map. The purpose of our knowledge structures is to find the major topics serving as subjects of article collections, as well as the related methods employed in the published papers. The purpose of topic knowledge maps is to transform high-dimensional objects (topic, paper, and cited frequency) into a 2-dimensional space to help understand the complicated relatedness among high-dimensional objects, such as the degree of relatedness between an article and a topic. First, we adopt the independent chi-square test to examine the independence of topics and apply a genetic algorithm to choose the topic selection with the best fitness value to construct knowledge structures. Additionally, high-dimensional relationships among objects are transformed into a 2-dimensional space using the multi-dimension scaling method. The optimal transformation coordinate matrix is also determined by using a genetic algorithm to preserve the original relations among objects and construct appropriate topic knowledge maps. }
}
@article{Ding2009290,
  title = {TCOM, an innovative data structure for mining association rules among infrequent items },
  journal = {Computers & Mathematics with Applications },
  volume = {57},
  number = {2},
  pages = {290 - 301},
  year = {2009},
  note = {},
  issn = {0898-1221},
  doi = {http://dx.doi.org/10.1016/j.camwa.2008.09.044},
  url = {http://www.sciencedirect.com/science/article/pii/S0898122108006470},
  author = {Junfeng Ding and Stephen S.T. Yau},
  keywords = {\{TCOM\}},
  keywords = {Association rule mining},
  keywords = {Data mining},
  keywords = {Infrequent itemset},
  keywords = {Rule pattern },
  abstract = {Association rule mining is one of the most important areas in data mining, and it has received a great deal of attention. The purpose of association rule mining is the discovery of association relationships or correlations among a set of items. In this paper, we present an efficient way to find valid association rules among infrequent items, a topic that is seldom mentioned and whose importance is often ignored by other researchers. We design a new data structure, called the Transactional Co-Occurrence Matrix, in short TCOM, built by two passes over the original transactional database. The occurrence counts of itemsets and valid association rules are then mined based on TCOM, which combines the advantages of both the transaction-oriented (horizontal) layout and the item-oriented (vertical) layout of the database. It turns out that any itemset can be randomly accessed and counted without a full scan of either the original database or the TCOM, which significantly improves the efficiency of the mining processes. }
}
@article{Guo2009439,
  title = {Implement web learning environment based on data mining },
  journal = {Knowledge-Based Systems },
  volume = {22},
  number = {6},
  pages = {439 - 442},
  year = {2009},
  note = {},
  issn = {0950-7051},
  doi = {http://dx.doi.org/10.1016/j.knosys.2009.06.001},
  url = {http://www.sciencedirect.com/science/article/pii/S0950705109000860},
  author = {Qinglin Guo and Ming Zhang},
  keywords = {Web-based learning},
  keywords = {Individual},
  keywords = {Data mining algorithm},
  keywords = {Learning technologies},
  keywords = {Data mining },
  abstract = {The need to provide learners with web-based learning content that matches their accessibility needs and preferences, as well as ways to match learning content to users’ devices, has been identified as an important issue in accessible educational environments. For a web-based open and dynamic learning environment, personalized support for learners becomes more important. In order to achieve optimal efficiency in a learning process, an individual learner’s cognitive learning style should be taken into account. Because different types of learners use these systems, it is necessary to provide them with an individualized learning support system. However, the design and development of web-based learning environments for people with special abilities has so far been addressed by the development of hypermedia and multimedia based on educational content. In this paper a framework for an individual web-based learning system is presented, focusing on the learner’s cognitive learning process, learning pattern and activities, as well as the technology support needed. Based on the learner-centered mode and cognitive learning theory, we demonstrate an online course design and development that supports students with learning flexibility and adaptability. The proposed framework utilizes a data mining algorithm for representing and extracting a dynamic learning process and learning pattern to support students’ deep learning, efficient tutoring and collaboration in a web-based learning environment. Experiments prove that it is feasible to use this method to develop an individual web-based learning system, which is valuable for further in-depth study. }
}
@incollection{Tzanetakis20141453,
  title = {Chapter 26 - Music Mining },
  editor = {Paulo S.R. Diniz and Johan A.K. Suykens and Rama Chellappa and Sergios Theodoridis},
  booktitle = {Academic Press Library in Signal Processing: Volume 1 Signal Processing Theory and Machine Learning},
  publisher = {Elsevier},
  year = {2014},
  volume = {1},
  pages = {1453 - 1492},
  series = {Academic Press Library in Signal Processing },
  issn = {2351-9819},
  doi = {http://dx.doi.org/10.1016/B978-0-12-396502-8.00026-7},
  url = {http://www.sciencedirect.com/science/article/pii/B9780123965028000267},
  author = {George Tzanetakis},
  abstract = {The multi-faceted nature of music information requires sophisticated algorithms and systems that combine signal processing and machine learning techniques in order to extract useful information from the large collections of music available today. This chapter overviews work in music mining, which is the application of data mining techniques for the purposes of music processing. Topics covered include content-based similarity retrieval, genre classification, emotion/mood classification, music clustering, automatic tag annotation, audio fingerprinting, cover song detection, as well as self-organizing maps and visualization. Open problems and future trends, as well as pointers for further reading, are also provided. }
}
@article{Romero2008368,
  title = {Data mining in course management systems: Moodle case study and tutorial },
  journal = {Computers & Education },
  volume = {51},
  number = {1},
  pages = {368 - 384},
  year = {2008},
  note = {},
  issn = {0360-1315},
  doi = {http://dx.doi.org/10.1016/j.compedu.2007.05.016},
  url = {http://www.sciencedirect.com/science/article/pii/S0360131507000590},
  author = {Cristóbal Romero and Sebastián Ventura and Enrique García},
  keywords = {Distance education and telelearning},
  keywords = {E-learning},
  keywords = {Evaluation of \{CAL\} systems},
  keywords = {Data mining},
  keywords = {Web mining },
  abstract = {Educational data mining is an emerging discipline, concerned with developing methods for exploring the unique types of data that come from the educational context. This work is a survey of the specific application of data mining in learning management systems and a case study tutorial with the Moodle system. Our objective is to introduce it both theoretically and practically to all users interested in this new research area, and in particular to online instructors and e-learning administrators. We describe the full process for mining e-learning data step by step as well as how to apply the main data mining techniques used, such as statistics, visualization, classification, clustering and association rule mining of Moodle data. We have used free data mining tools so that any user can immediately begin to apply data mining without having to purchase a commercial tool or program a specific personalized tool. }
}
@article{Tseng2009697,
  title = {Energy-efficient real-time object tracking in multi-level sensor networks by mining and predicting movement patterns },
  journal = {Journal of Systems and Software },
  volume = {82},
  number = {4},
  pages = {697 - 706},
  year = {2009},
  note = {Special Issue: Selected papers from the 2008 \{IEEE\} Conference on Software Engineering Education and Training (CSEET08) },
  issn = {0164-1212},
  doi = {http://dx.doi.org/10.1016/j.jss.2008.10.011},
  url = {http://www.sciencedirect.com/science/article/pii/S0164121208002215},
  author = {Vincent S. Tseng and Eric Hsueh-Chan Lu},
  keywords = {Sensor networks},
  keywords = {Location prediction},
  keywords = {Real-time object tracking},
  keywords = {Data mining },
  abstract = {A number of studies have been published on sensor networks in the past few years due to their wide range of potential applications. Object tracking is an important topic in sensor networks, and the limited power of sensor nodes presents numerous challenges to researchers. Previous studies of energy conservation in sensor networks have considered object movement behavior to be random. However, in some applications, the movement behavior of an object is often based on certain underlying events rather than being completely random. Moreover, few studies have considered the real-time issue in addition to the energy saving problem for object tracking in sensor networks. In this paper, we propose a novel strategy named the multi-level object tracking strategy (MLOT) for energy-efficient and real-time tracking of moving objects in sensor networks by mining the movement log. In MLOT, we first conduct hierarchical clustering to form a hierarchical model of the sensor nodes. Second, the movement logs of the moving objects are analyzed by a data mining algorithm to obtain the movement patterns, which are then used to predict the next position of a moving object. We use the multi-level structure to represent the hierarchical relations among sensor nodes so as to achieve the goal of keeping track of moving objects in a real-time manner. Through experimental evaluation under various simulated conditions, the proposed method is shown to deliver excellent performance in terms of both energy efficiency and timeliness. }
}
@article{Li2009481,
  title = {Mining non-derivable frequent itemsets over data stream },
  journal = {Data & Knowledge Engineering },
  volume = {68},
  number = {5},
  pages = {481 - 498},
  year = {2009},
  note = {},
  issn = {0169-023X},
  doi = {http://dx.doi.org/10.1016/j.datak.2009.01.002},
  url = {http://www.sciencedirect.com/science/article/pii/S0169023X09000020},
  author = {Haifeng Li and Hong Chen},
  keywords = {Stream},
  keywords = {Non-derivable frequent itemsets},
  keywords = {Data mining },
  abstract = {Non-derivable frequent itemsets are one of several condensed representations of frequent itemsets, which store all of the information contained in frequent itemsets using less space, thus being more suitable for stream mining. This paper considers a problem that to the best of our knowledge has not been addressed, namely, how to mine non-derivable frequent itemsets in an incremental fashion. We design a compact data structure named \{NDFIT\} to efficiently maintain a dynamically selected set of itemsets. In NDFIT, the nodes are divided into four categories to reduce the redundant computational cost based on their properties. Consequently, an optimized algorithm named \{NDFIoDS\} is proposed to generate non-derivable frequent itemsets over stream sliding window. Our experimental results show that this method is effective and more efficient than previous approaches. }
}
@article{Zhuang2009662,
  title = {Combining data mining and case-based reasoning for intelligent decision support for pathology ordering by general practitioners },
  journal = {European Journal of Operational Research },
  volume = {195},
  number = {3},
  pages = {662 - 675},
  year = {2009},
  note = {},
  issn = {0377-2217},
  doi = {http://dx.doi.org/10.1016/j.ejor.2007.11.003},
  url = {http://www.sciencedirect.com/science/article/pii/S0377221707010806},
  author = {Zoe Y. Zhuang and Leonid Churilov and Frada Burstein and Ken Sikaris},
  keywords = {Decision support},
  keywords = {Data mining},
  keywords = {Case-based reasoning},
  keywords = {Data clustering},
  keywords = {Kohonen’s self-organizing maps},
  keywords = {Health care systems },
  abstract = {Pathology ordering by general practitioners (GPs) is a significant contributor to rising health care costs both in Australia and worldwide. A thorough understanding of the nature and patterns of pathology utilization is an essential requirement for effective decision support for pathology ordering. In this paper a novel methodology for integrating data mining and case-based reasoning for decision support for pathology ordering is proposed. It is demonstrated how this methodology can facilitate intelligent decision support that is both patient-oriented and deeply rooted in practical peer-group evidence. Comprehensive data collected by professional pathology companies provide a system-wide profile of patient-specific pathology requests by various \{GPs\}, as opposed to one limited to an individual \{GP\} practice. Using the real data provided by \{XYZ\} Pathology Company in Australia, which contain more than 1.5 million records of pathology requests by general practitioners (GPs), we illustrate how knowledge extracted from these data through data mining with Kohonen’s self-organizing maps constitutes the base that, with further assistance of modern data visualization tools and on-line processing interfaces, can provide “peer-group consensus” evidence support for solving new cases of the pathology test ordering problem. The conclusion is that a formal methodology that integrates case-based reasoning principles, which are inherently close to GPs’ daily practice, with data-driven, computationally intensive knowledge discovery mechanisms, which can be applied to the massive amounts of pathology request data routinely available at professional pathology companies, can facilitate more informed evidential decision making by doctors in the area of pathology ordering. }
}
@article{Hung20097064,
  title = {An attentive self-organizing neural model for text mining },
  journal = {Expert Systems with Applications },
  volume = {36},
  number = {3, Part 2},
  pages = {7064 - 7071},
  year = {2009},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2008.08.037},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417408005903},
  author = {Chihli Hung and Yu-Liang Chi and Tsang-Yao Chen},
  keywords = {Attentive agent},
  keywords = {Web text mining},
  keywords = {Search engine},
  keywords = {Self-organizing map},
  keywords = {Personalized search },
  abstract = {This paper utilizes an attention concept approach in text mining to address the deficiencies of existing keyword search engines. We show how an attention concept in conjunction with a traditional search approach can be used to develop an adaptive text mining model with user-oriented, time-based and attentive knowledge. Without changing a user’s search behavior, this paper considers some specific post-search operations as attentive targets for building the personalized interest base. This interest base is further shown on an interest map via the self-organizing map algorithm (SOM). By comparison with the personalized interest map, the original search results from a keyword search engine are re-ranked. Experimental results demonstrate that the attentive search mechanism is able to improve user satisfaction. }
}
@article{Kuivenhoven20141993,
  title = {Mining the genome for lipid genes },
  journal = {Biochimica et Biophysica Acta (BBA) - Molecular Basis of Disease },
  volume = {1842},
  number = {10},
  pages = {1993 - 2009},
  year = {2014},
  note = {From genome to function },
  issn = {0925-4439},
  doi = {http://dx.doi.org/10.1016/j.bbadis.2014.04.028},
  url = {http://www.sciencedirect.com/science/article/pii/S092544391400115X},
  author = {Jan Albert Kuivenhoven and Robert A. Hegele},
  keywords = {Lipoprotein},
  keywords = {Primary dyslipidemia},
  keywords = {Secondary dyslipidemia},
  keywords = {Gene discovery },
  abstract = {Mining of the genome for lipid genes has, since the early 1970s, helped to shape our understanding of how triglycerides are packaged (in chylomicrons), repackaged (in very low density lipoproteins; VLDL), and hydrolyzed, and also how remnant and low-density lipoproteins (LDL) are cleared from the circulation. Gene discoveries have also provided insights into high-density lipoprotein (HDL) biogenesis and remodeling. Interestingly, at least half of these key molecular genetic studies were initiated with the benefit of prior knowledge of relevant proteins. In addition, multiple important findings originated from studies in mice, and from other types of non-genetic approaches. Although it appears by now that the main lipid pathways have been uncovered, and that only modulators or adaptor proteins such as those encoded by LDLRAP1, APOA5, ANGPTL3/4, and \{PCSK9\} are currently being discovered, genome wide association studies (GWAS) in particular have implicated many new loci based on statistical analyses; these may prove to have equally large impacts on lipoprotein traits as gene products that are already known. On the other hand, since 2004 – and particularly since 2010, when massively parallel sequencing became de rigueur – no major new insights into genes governing lipid metabolism have been reported. This is probably because the etiologies of true Mendelian lipid disorders with overt clinical complications have been largely resolved. In the meantime, it has become clear that proving the importance of new candidate genes is challenging. This could be due to very low frequencies of large-impact variants in the population. It must further be emphasized that functional genetic studies, while necessary, are often difficult to accomplish, making it hazardous to upgrade a variant from being simply associated to being definitively causative. Also, it is clear that applying a monogenic approach to dissect complex lipid traits that are mostly of polygenic origin is the wrong way to proceed. The hope is that large-scale data acquisition combined with sophisticated computerized analyses will help to prioritize and select the most promising candidate genes for future research. We suggest that at this point in time, investment in sequence-technology-driven candidate gene discovery could be recalibrated by refocusing efforts on direct functional analysis of the genes that have already been discovered. This article is part of a Special Issue entitled: From Genome to Function. }
}
@article{Lu20093536,
  title = {A study of applying data mining approach to the information disclosure for Taiwan’s stock market investors },
  journal = {Expert Systems with Applications },
  volume = {36},
  number = {2, Part 2},
  pages = {3536 - 3542},
  year = {2009},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2008.02.007},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417408001346},
  author = {Chi-Lin Lu and Ta-Cheng Chen},
  keywords = {Information disclosure},
  keywords = {Data mining},
  keywords = {Classification rule },
  abstract = {The financial literature and practice have shown the importance of corporate governance for decades, not only for firms’ management but also for investor protection. Information disclosure plays a key role in all of the governance mechanisms. With good information disclosure, the asymmetric information and the agency cost between the insiders and the outsiders of firms can be reduced effectively. However, the information disclosure status of listed companies is hard for investors to evaluate or judge before the annual official announcement is reported the following year. The main purpose of this study is to explore the hidden knowledge of information disclosure status among the listed companies in Taiwan’s stock market. In this paper, we employed decision tree-based mining techniques to explore the classification rules of information transparency levels of the listed firms in Taiwan’s stock market. Moreover, a multi-learner model constructed by a boosting ensemble approach with the decision tree algorithm has been applied. The numerical results show that the classification accuracy has been improved by using the multi-learner model, in terms of fewer Type I and Type \{II\} errors. In particular, the extracted rules from the data mining approach can be developed into a computer model for the prediction or classification of good/poor information disclosure potential, much like expert systems. }
}
@article{Zhou201440,
  title = {Preference-based mining of top-K influential nodes in social networks },
  journal = {Future Generation Computer Systems },
  volume = {31},
  number = {0},
  pages = {40 - 47},
  year = {2014},
  note = {Special Section: Advances in Computer Supported Collaboration: Systems and Technologies },
  issn = {0167-739X},
  doi = {http://dx.doi.org/10.1016/j.future.2012.06.011},
  url = {http://www.sciencedirect.com/science/article/pii/S0167739X12001471},
  author = {Jingyu Zhou and Yunlong Zhang and Jia Cheng},
  keywords = {User preference},
  keywords = {Influence maximization},
  keywords = {\{SVD\}},
  keywords = {Collaborative filtering},
  keywords = {Social network },
  abstract = {Many important applications can be generalized as the influence maximization problem, which targets finding a K-node set in a social network that has the maximum influence. Previous work only considers that influence is propagated through the network with a uniform probability. However, because users actually have different preferences on topics, such a uniform propagation can result in inaccurate results. To solve this problem, we have designed a two-stage mining algorithm (GAUP) to mine the most influential nodes in a network on a given topic. Given a set of users’ documents labeled with topics, \{GAUP\} first computes user preferences with a latent feature model based on \{SVD\} or a model based on vector space. Then, to find the top-K nodes in the second stage, \{GAUP\} adopts a greedy algorithm that is guaranteed to find a solution within 63% of the optimal. Our evaluation on the task of expert finding shows that \{GAUP\} performs better than the state-of-the-art greedy algorithm, SVD-based collaborative filtering, and HITS. }
}
@article{KraljNovak2009113,
  title = {CSM-SD: Methodology for contrast set mining through subgroup discovery },
  journal = {Journal of Biomedical Informatics },
  volume = {42},
  number = {1},
  pages = {113 - 122},
  year = {2009},
  note = {},
  issn = {1532-0464},
  doi = {http://dx.doi.org/10.1016/j.jbi.2008.08.007},
  url = {http://www.sciencedirect.com/science/article/pii/S1532046408001032},
  author = {Petra Kralj Novak and Nada Lavrač and Dragan Gamberger and Antonija Krstačić},
  keywords = {Contrast set mining},
  keywords = {Subgroup discovery},
  keywords = {Supporting factors},
  keywords = {Descriptive rules},
  keywords = {Brain ischemia },
  abstract = {This paper addresses a data analysis task, known as contrast set mining, whose goal is to find differences between contrasting groups. As a methodological novelty, it is shown that this task can be effectively solved by transforming it to a more common and well-understood subgroup discovery task. The transformation is studied in two learning settings, a one-versus-all and a pairwise contrast set mining setting, uncovering the conditions for each of the two choices. Moreover, the paper shows that the explanatory potential of discovered contrast sets can be improved by offering additional contrast set descriptors, called the supporting factors. The proposed methodology has been applied to uncover distinguishing characteristics of two groups of brain stroke patients, both with rapidly developing loss of brain function due to ischemia: those with ischemia caused by thrombosis and by embolism, respectively. }
}
@article{Liu2009972,
  title = {Mining the change of event trends for decision support in environmental scanning },
  journal = {Expert Systems with Applications },
  volume = {36},
  number = {2, Part 1},
  pages = {972 - 984},
  year = {2009},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2007.10.016},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417407005416},
  author = {Duen-Ren Liu and Meng-Jung Shih and Churn-Jung Liau and Chin-Hui Lai},
  keywords = {Information processing and management},
  keywords = {Association rule mining},
  keywords = {Change mining},
  keywords = {Environmental scanning},
  keywords = {Event detection},
  keywords = {Event tracking },
  abstract = {As the business environment has become increasingly complex, the demand for environmental scanning to assist company managers in planning strategies and responses has grown significantly. The conventional technique for supporting environmental scanning is event detection from text documents such as news stories. Event detection methods recognize events, but neglect to discover the changes brought about by the events. In this work, we propose an event change detection (ECD) approach that combines association rule mining and change mining techniques. The approach detects changes caused by events to help managers respond rapidly to changes in the external environment. Association rule mining is used to discover event trends (the subject patterns of events) from news stories. The changes can be identified by comparing event trends in different time periods. The empirical evaluation showed that the discovered event changes can support decision-makers by providing up-to-date information about the business environment, which enables them to make appropriate decisions. The proposed approach gives business managers a practical way to stay aware of environmental changes and adjust their business strategies accordingly. }
}
@article{Boland20091064,
  title = {LP-based disaggregation approaches to solving the open pit mining production scheduling problem with block processing selectivity },
  journal = {Computers & Operations Research },
  volume = {36},
  number = {4},
  pages = {1064 - 1089},
  year = {2009},
  note = {},
  issn = {0305-0548},
  doi = {http://dx.doi.org/10.1016/j.cor.2007.12.006},
  url = {http://www.sciencedirect.com/science/article/pii/S0305054807002663},
  author = {Natashia Boland and Irina Dumitrescu and Gary Froyland and Ambros M. Gleixner},
  keywords = {Open pit mining},
  keywords = {Constrained scheduling problems},
  keywords = {Net present value},
  keywords = {Aggregation},
  keywords = {Iterative disaggregation},
  keywords = {Mixed integer programming },
  abstract = {Given a discretisation of an orebody as a block model, the open pit mining production scheduling problem (OPMPSP) consists of finding the sequence in which the blocks should be removed from the pit, over the lifetime of the mine, such that the net present value (NPV) of the operation is maximised. In practice, due to the large number of blocks and precedence constraints linking them, blocks are typically aggregated to form larger scheduling units. We aim to solve the OPMPSP, formulated as a mixed integer programme (MIP), so that aggregates are used to schedule the mining process, while individual blocks are used for processing decisions. We propose an iterative disaggregation method that refines the aggregates (with respect to processing) up to the point where the refined aggregates defined for processing produce the same optimal solution for the linear programming (LP) relaxation of the \{MIP\} as the optimal solution of the \{LP\} relaxation with individual block processing. We propose several strategies for creating refined aggregates for \{MIP\} processing, using duality results and exploiting the problem structure. These refined aggregates allow the solution of very large problems in reasonable time with very high solution quality in terms of NPV. }
}
@article{Duan2009400,
  title = {Classification and evaluation of timed running schemas for workflow based on process mining },
  journal = {Journal of Systems and Software },
  volume = {82},
  number = {3},
  pages = {400 - 410},
  year = {2009},
  note = {},
  issn = {0164-1212},
  doi = {http://dx.doi.org/10.1016/j.jss.2008.07.007},
  url = {http://www.sciencedirect.com/science/article/pii/S0164121208001799},
  author = {Hua Duan and Qingtian Zeng and Huaiqing Wang and Sherry X. Sun and Dongming Xu},
  keywords = {Workflow},
  keywords = {Running logs},
  keywords = {Running schema},
  keywords = {Classification},
  keywords = {Evaluation},
  keywords = {Process mining},
  keywords = {Petri net },
  abstract = {The system running logs of a workflow contain much information about the behavior of, and the logical structure between, activities. In this paper, a mining approach is proposed to discover the structural and temporal model of a workflow from its timed running logs. The mining results are represented in the formalized form of Petri nets extended with two timing factors, which allows validation or verification of the actual behaviors, especially the temporal constraints between activities. According to the reachability graph of the extended Petri net model mined, all running schemas of a workflow can be generated, which define the temporal constraints between running activities. By calculating the earliest and latest start time of each activity, the earliest starting and latest existing time of each state in the running schema can be determined. Based on the temporal relations between the timing factors of each running state, the running schemas can be classified into six classes. The effects of the six classes of running schemas on the implementation of the whole workflow are evaluated so as to obtain the best one, which can ensure the workflow is finished in the shortest time. The standards for the ideal, reliable and favorable running schemas and their existence conditions are discussed, which can be used to evaluate the running logs and control the future running of a workflow. }
}
@article{Shih20095523,
  title = {Applying hybrid data mining techniques to web-based self-assessment system of Study and Learning Strategies Inventory },
  journal = {Expert Systems with Applications },
  volume = {36},
  number = {3, Part 1},
  pages = {5523 - 5532},
  year = {2009},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2008.06.089},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417408004193},
  author = {Chien-Chou Shih and Ding-An Chiang and Sheng-Wei Lai and Yen-Wei Hu},
  keywords = {Data mining},
  keywords = {Association rule},
  keywords = {Decision tree},
  keywords = {Self-assessment},
  keywords = {\{LASSI\} },
  abstract = {Traditional assessment tools, such as the “Learning and Study Strategies Inventory (LASSI)”, are typically pen-and-paper tests that require responses to a multitude of questions. This may easily lead to students’ resistance, fatigue and unwillingness to complete the assessment. To improve the situation, a hybrid data mining technique was applied to analyze the \{LASSI\} surveys of freshman students at Tamkang University. The most significant contribution of this research is in dynamically reducing the number of questions while the \{LASSI\} assessment is proceeding. To verify the applicability of the proposed method, a web-based \{LASSI\} self-assessment system (Web-LSA) was developed. This system can be used as a guide to determine study disturbances for high-risk groups, and can provide counselors with fundamental information on which to base follow-up counseling services for its users. }
}
@article{Liao20094967,
  title = {Mining information users’ knowledge for one-to-one marketing on information appliance },
  journal = {Expert Systems with Applications },
  volume = {36},
  number = {3, Part 1},
  pages = {4967 - 4979},
  year = {2009},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2008.06.020},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417408003394},
  author = {Shu-Hsien Liao and Chyuan-Meei Chen and Chia-Lin Hsieh and Shih-Chung Hsiao},
  keywords = {Information appliance},
  keywords = {One-to-one marketing},
  keywords = {Ontology},
  keywords = {Data-mining},
  keywords = {Association rules},
  keywords = {Clustering analysis},
  keywords = {Classification and regression trees (CART)},
  keywords = {Knowledge extraction },
  abstract = {All kinds of information technologies have been converging rapidly in recent years, from simple traditional computers to diversified multimedia information appliances, creating unprecedented technologies and devices, such as personal digital assistants (PDAs), smart cell phones, portable media players (PMPs), online console games, and set top boxes, among others. These converged electronic devices, with powerful contents and functions such as the World Wide Web, videoconferencing, e-mail, internet telephony, online gaming, digital television, and net banking, are easier to use than traditional computers but no less capable of performing daily tasks. These information technology revolutions, along with the rapid growth of network technology, not only increased the number of internet applications and digital contents, but also led to diversified consumer behaviors, increased competition, and new opportunities. On the other hand, one-to-one marketing is different from traditional marketing methods because it focuses on customer satisfaction and is customer-oriented rather than focused on marketing to mass consumers; thus a one-to-one marketer tries to find more different products and services for the same customer. Therefore, how to establish potential cross-selling and one-to-one offers through product mix analysis, enhance relationships with customers by means of personalized offers through product knowledge, and understand users’ needs and make useful suggestions for new product developments and one-to-one marketing have become critical issues for information appliance firms. This paper proposes association rules, clustering analysis and \{CART\} as data-mining methodologies, implemented for mining product and marketing knowledge from information users. Knowledge extracted from information users is illustrated as knowledge patterns, rules, clusters, and trees in order to propose suggestions on one-to-one marketing for information appliance firms. }
}
@article{Tsoi2009824,
  title = {Text-mining approach to evaluate terms for ontology development },
  journal = {Journal of Biomedical Informatics },
  volume = {42},
  number = {5},
  pages = {824 - 830},
  year = {2009},
  note = {Biomedical Natural Language Processing },
  issn = {1532-0464},
  doi = {http://dx.doi.org/10.1016/j.jbi.2009.03.009},
  url = {http://www.sciencedirect.com/science/article/pii/S1532046409000483},
  author = {Lam C. Tsoi and Ravi Patel and Wenle Zhao and W. Jim Zheng},
  keywords = {Ontology development},
  keywords = {Hypergeometric test},
  keywords = {PubMed},
  keywords = {Text mining },
  abstract = {Developing ontologies to account for the complexity of biological systems requires the time intensive collaboration of many participants with expertise in various fields. While each participant may contribute to construct a list of terms for ontology development, no objective methods have been developed to evaluate how relevant each of these terms is to the intended domain. We have developed a computational method based on a hypergeometric enrichment test to evaluate the relevance of such terms to the intended domain. The proposed method uses the PubMed literature database to evaluate whether each potential term for ontology development is overrepresented in the abstracts that discuss the particular domain. This evaluation provides an objective approach to assess terms and prioritize them for ontology development. }
}
@article{Hamrouni20091091,
  title = {Sweeping the disjunctive search space towards mining new exact concise representations of frequent itemsets },
  journal = {Data & Knowledge Engineering },
  volume = {68},
  number = {10},
  pages = {1091 - 1111},
  year = {2009},
  note = {},
  issn = {0169-023X},
  doi = {http://dx.doi.org/10.1016/j.datak.2009.05.001},
  url = {http://www.sciencedirect.com/science/article/pii/S0169023X09000718},
  author = {T. Hamrouni and S. Ben Yahia and E. Mephu Nguifo},
  keywords = {Data mining},
  keywords = {Frequent itemset},
  keywords = {Association rule},
  keywords = {Concise representation},
  keywords = {Complementary occurrence},
  keywords = {Disjunctive support},
  keywords = {Disjunctive search space},
  keywords = {Closure operator},
  keywords = {Equivalence class},
  keywords = {Disjunctive closed itemset},
  keywords = {Essential itemset},
  keywords = {Generalized association rule},
  keywords = {Minimum description length principle },
  abstract = {Concise (or condensed) representations of frequent patterns follow the minimum description length (MDL) principle, by providing the shortest description of the whole set of frequent patterns. In this work, we introduce a new exact concise representation of frequent itemsets. This representation is based on an exploration of the disjunctive search space. The disjunctive itemsets convey information about the complementary occurrence of items in a dataset. A novel closure operator is then devised to suit the characteristics of the explored search space. The proposed operator aims at mapping many disjunctive itemsets to a unique one, called a disjunctive closed itemset. Hence, it drastically reduces the number of itemsets handled within the targeted representation. Interestingly, the proposed representation offers direct access to the disjunctive and negative supports of frequent itemsets while ensuring the derivation of their exact conjunctive supports. We conclude from the experimental results reported and discussed here that our representation is effective and sound in comparison with other concise representations. }
}
@article{Liao20081338,
  title = {Mining marketing maps for business alliances },
  journal = {Expert Systems with Applications },
  volume = {35},
  number = {3},
  pages = {1338 - 1350},
  year = {2008},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2007.08.052},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417407003478},
  author = {Shu-Hsien Liao and Wen-Jung Chang and Chai-Chen Lee},
  keywords = {Business alliance},
  keywords = {Marketing maps},
  keywords = {Database marketing},
  keywords = {Data mining},
  keywords = {Association rules},
  keywords = {Knowledge extraction },
  abstract = {A business can strengthen its competitive advantage and increase its market share by forming a strategic alliance. With the help of alliances, businesses can bring to bear significant resources beyond the capabilities of the individual co-operating firms. Thus, how to effectively evaluate and select alliance partners is an important task for businesses, because successful partner selection can reduce the possible risks and avoid the failure of a business alliance. This paper proposes the Apriori algorithm as a methodology of association rules for data mining, which is implemented for mining marketing map knowledge from customers. Knowledge extracted from marketing maps is illustrated as knowledge patterns and rules in order to propose suggestions for business alliances and possible co-operation solutions. Finally, this study suggests that integrating different research factors, variables, theories, and methods for investigating this research topic of business alliances could improve research results and scope. }
}
@incollection{Arentze2009325,
  title = {Spatial Data Mining, Cluster and Pattern Recognition },
  editor = {Kitchin, Rob and Thrift, Nigel },
  booktitle = {International Encyclopedia of Human Geography },
  publisher = {Elsevier},
  edition = {},
  address = {Oxford},
  year = {2009},
  pages = {325 - 331},
  isbn = {978-0-08-044910-4},
  doi = {http://dx.doi.org/10.1016/B978-008044910-4.00524-1},
  url = {http://www.sciencedirect.com/science/article/pii/B9780080449104005241},
  author = {T.A. Arentze},
  keywords = {Classification},
  keywords = {Cluster analysis},
  keywords = {Geographic analysis},
  keywords = {Knowledge discovery},
  keywords = {Machine learning},
  keywords = {Pattern recognition},
  keywords = {Spatial data mining},
  keywords = {Spatial databases},
  keywords = {Supervised learning},
  keywords = {Unsupervised learning },
  abstract = {Extracting meaningful patterns from large databases is a relevant task in several areas of geographic research such as the interpretation of satellite images, the study of dispersion of spatial phenomena (e.g., diseases, crime), and classification of space–time behavior of individuals, to name a few. This article discusses the techniques and issues involved in spatial data mining for cluster detection and pattern recognition. The techniques range from inductive machine learning algorithms to numerical cluster detection techniques. Irrespective of the technique used, a number of issues require attention in any spatial data-mining task. These include validity testing, the selection of relevant features, interpretation of patterns, and treatment of spatial data. Approaches developed to address these issues are discussed in this article as well. }
}
@article{Medelyan2009716,
  title = {Mining meaning from Wikipedia },
  journal = {International Journal of Human-Computer Studies },
  volume = {67},
  number = {9},
  pages = {716 - 754},
  year = {2009},
  note = {},
  issn = {1071-5819},
  doi = {http://dx.doi.org/10.1016/j.ijhcs.2009.05.004},
  url = {http://www.sciencedirect.com/science/article/pii/S1071581909000561},
  author = {Olena Medelyan and David Milne and Catherine Legg and Ian H. Witten},
  keywords = {Wikipedia},
  keywords = {Text mining},
  keywords = {Wikipedia mining},
  keywords = {\{NLP\}},
  keywords = {Information retrieval},
  keywords = {Information extraction},
  keywords = {Ontologies},
  keywords = {Semantic web },
  abstract = {Wikipedia is a goldmine of information; not just for its many readers, but also for the growing community of researchers who recognize it as a resource of exceptional scale and utility. It represents a vast investment of manual effort and judgment: a huge, constantly evolving tapestry of concepts and relations that is being applied to a host of tasks. This article provides a comprehensive description of this work. It focuses on research that extracts and makes use of the concepts, relations, facts and descriptions found in Wikipedia, and organizes the work into four broad categories: applying Wikipedia to natural language processing; using it to facilitate information retrieval; using it for information extraction; and exploiting it as a resource for ontology building. The article addresses how Wikipedia is being used as is, how it is being improved and adapted, and how it is being combined with other structures to create entirely new resources. We identify the research groups and individuals involved, and how their work has developed in the last few years. We provide a comprehensive list of the open-source software they have produced. }
}
@article{Lee20093459,
  title = {Mining students’ behavior in web-based learning programs },
  journal = {Expert Systems with Applications },
  volume = {36},
  number = {2, Part 2},
  pages = {3459 - 3464},
  year = {2009},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2008.02.054},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417408001115},
  author = {Man Wai Lee and Sherry Y. Chen and Kyriacos Chrysostomou and Xiaohui Liu},
  keywords = {Cognitive styles},
  keywords = {Data mining},
  keywords = {Field dependence},
  keywords = {Web-based learning },
  abstract = {There has been a proliferation of web-based learning programs (WBLPs). Unlike traditional computer-based learning programs, \{WBLPs\} are used by a population of learners with diverse backgrounds. How different learners access \{WBLPs\} has been investigated by several studies, which indicate that cognitive style is an important factor that influences learners’ preferences. However, these studies mainly use statistical methods to analyze learners’ preferences. In this paper, we propose to analyze learners’ preferences with a data mining technique. Findings in our study show that Field Independent learners frequently use the backward/forward buttons and spend less time on navigation. On the other hand, Field Dependent learners often use the main menu and make more repeated visits. Implications of these findings are discussed. }
}
@article{Chu20081105,
  title = {An efficient algorithm for mining temporal high utility itemsets from data streams },
  journal = {Journal of Systems and Software },
  volume = {81},
  number = {7},
  pages = {1105 - 1117},
  year = {2008},
  note = {},
  issn = {0164-1212},
  doi = {http://dx.doi.org/10.1016/j.jss.2007.07.026},
  url = {http://www.sciencedirect.com/science/article/pii/S0164121207001859},
  author = {Chun-Jung Chu and Vincent S. Tseng and Tyne Liang},
  keywords = {Utility mining},
  keywords = {Temporal high utility itemsets},
  keywords = {Data stream mining},
  keywords = {Association rules },
  abstract = {The utility of an itemset is considered as its value, and utility mining aims at identifying the itemsets with high utilities. Temporal high utility itemsets are itemsets whose support is larger than a pre-specified threshold in the current time window of the data stream. Discovery of temporal high utility itemsets is an important process for mining interesting patterns, like association rules, from data streams. In this paper, we propose a novel method, namely \{THUI\} (Temporal High Utility Itemsets)-Mine, for mining temporal high utility itemsets from data streams efficiently and effectively. To the best of our knowledge, this is the first work on mining temporal high utility itemsets from data streams. The novel contribution of THUI-Mine is that it can effectively identify the temporal high utility itemsets by generating fewer candidate itemsets, such that the execution time for mining all high utility itemsets in data streams can be reduced substantially. In this way, all temporal high utility itemsets under all time windows of data streams can be discovered effectively with less memory space and execution time. This meets the critical requirements on time and space efficiency for mining data streams. Through experimental evaluation, THUI-Mine is shown to significantly outperform other existing methods, like the Two-Phase algorithm, under various experimental conditions. }
}
@article{Gifford2008340,
  title = {Toward a theory of local legitimacy by \{MNEs\} in developing nations: Newmont mining and health sustainable development in Peru },
  journal = {Journal of International Management },
  volume = {14},
  number = {4},
  pages = {340 - 352},
  year = {2008},
  note = {The Role of Corporate Social and Environmental Responsibility in International Business },
  issn = {1075-4253},
  doi = {http://dx.doi.org/10.1016/j.intman.2007.09.005},
  url = {http://www.sciencedirect.com/science/article/pii/S1075425308000720},
  author = {Blair Gifford and Andrew Kestler},
  keywords = {Institutional theory},
  keywords = {Sustainable development},
  keywords = {Corporate social responsibility},
  keywords = {Mining },
  abstract = {This paper describes a current initiative by Newmont Mining Corporation (Newmont) to develop sustainable community benefit in communities around its mining operations in Peru in response to heightened criticism of Newmont by non-government organizations and the media. Using anthropologically oriented methods, a community health assessment project in an area of projected mining is described in detail in this paper. This case adds to London and Hart's social embeddedness strategy for multi-national enterprises (MNEs) working in developing nations by introducing a locally-based community interaction model, which we describe as a local legitimacy strategy, in an effort to bring about sustainable development in the communities that surround a MNE's production activities. The components of our local legitimacy strategy include co-analysis of community needs by \{MNEs\} and community partners, and planning and investment in developments to enhance the social fabric and the physical infrastructure needs of communities. The developing world is getting better at publicizing and monitoring the work of MNEs. We argue that it will be increasingly necessary for MNEs, like Newmont, to add local sustainable benefit into their strategic mix to gain the social license and legitimacy that is needed to operate in poorer communities. }
}
@article{Poon20092213,
  title = {Augmenting productivity analysis with data mining: An application on \{IT\} business value },
  journal = {Expert Systems with Applications },
  volume = {36},
  number = {2, Part 1},
  pages = {2213 - 2224},
  year = {2009},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2007.12.028},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417407006434},
  author = {Simon K. Poon and Joseph G. Davis and Byounggu Choi},
  keywords = {Association rules mining},
  keywords = {Complementarities},
  keywords = {Information technology business value},
  keywords = {Productivity paradox },
  abstract = {In this paper we use a large firm-level dataset to extend previous studies by augmenting the endogenous growth accounting framework with a data mining technique to analyze the complex relationships between the use of \{IT\} and organizational practices. There is emerging evidence of a recent emphasis on organizational factors and a greater shift towards “IT complementarities”, in which value addition is linked to combining complementary organizational practices with \{IT\} investments. Our findings indicate that the set of interrelated organizational practices that complement \{IT\} use positively is different from the set of practices hindering \{IT\} use. The presence of clustering among organizational practices clearly implies that some combinations of practices are difficult to examine precisely in empirical terms. We have found that our technique is able to show that some organizational factors may affect organizational performance through different pathways; such organizational practices have often been overlooked but can play a weak yet non-trivial role in production and organizational processes. }
}
@article{Petrič2009219,
  title = {Literature mining method RaJoLink for uncovering relations between biomedical concepts },
  journal = {Journal of Biomedical Informatics },
  volume = {42},
  number = {2},
  pages = {219 - 227},
  year = {2009},
  note = {},
  issn = {1532-0464},
  doi = {http://dx.doi.org/10.1016/j.jbi.2008.08.004},
  url = {http://www.sciencedirect.com/science/article/pii/S1532046408001044},
  author = {Ingrid Petrič and Tanja Urbančič and Bojan Cestnik and Marta Macedoni-Lukšič},
  keywords = {Literature mining},
  keywords = {Knowledge discovery},
  keywords = {Hypotheses generation},
  keywords = {Biomedical articles},
  keywords = {Autism },
  abstract = {To support biomedical experts in their knowledge discovery process, we have developed a literature mining method called RaJoLink for the identification of relations between biomedical concepts in disconnected sets of articles. The method implements Swanson’s \{ABC\} model approach for generating hypotheses in a new way. The main novelty is a semi-automated suggestion of candidates for agents a that might be logically connected with a given phenomenon c under investigation. The choice of candidates for a is based on rare terms identified in the literature on c. As rare terms are not part of the typical range of information that describes the phenomenon under investigation, such information might be considered unusual observations about the phenomenon c. If the literatures on these rare terms have an interesting term in common, this joint term is declared a candidate for a. Linking terms b between the literature on a and the literature on c are then searched for in the closed discovery step to provide additional supportive evidence for the uncovered connections. We have applied the method to the literature on autism, using \{MEDLINE\} as the source of data. Expert evaluation has confirmed that the discovered relations might contribute to a better understanding of autism. }
}
@incollection{Džeroski2008821,
  title = {Data Mining },
  editor = {Jørgensen, Sven Erik and Fath, Brian D. },
  booktitle = {Encyclopedia of Ecology },
  publisher = {Academic Press},
  edition = {},
  address = {Oxford},
  year = {2008},
  pages = {821 - 830},
  isbn = {978-0-08-045405-4},
  doi = {http://dx.doi.org/10.1016/B978-008045405-4.00153-1},
  url = {http://www.sciencedirect.com/science/article/pii/B9780080454054001531},
  author = {S. Džeroski},
  keywords = {Classification},
  keywords = {Classification rules},
  keywords = {Data mining (DM)},
  keywords = {DM applications in ecological modeling},
  keywords = {Decision trees},
  keywords = {Equation discovery},
  keywords = {Habitat suitability},
  keywords = {Knowledge discovery in databases (KDD)},
  keywords = {Population dynamics },
  abstract = {Data mining, the central activity in the process of knowledge discovery in databases (KDD), is concerned with finding patterns in data. This article introduces and illustrates the most common types of patterns considered by data mining approaches and gives rough outlines of the data mining algorithms that are most frequently used to look for such patterns. It also provides an overview of \{KDD\} applications in environmental sciences, complemented with a sample of case studies. The application domains addressed mostly concern ecological modeling. }
}
@article{WANG2008168,
  title = {Extracting mining subsidence land from remote sensing images based on domain knowledge },
  journal = {Journal of China University of Mining and Technology },
  volume = {18},
  number = {2},
  pages = {168 - 181},
  year = {2008},
  note = {},
  issn = {1006-1266},
  doi = {http://dx.doi.org/10.1016/S1006-1266(08)60036-X},
  url = {http://www.sciencedirect.com/science/article/pii/S100612660860036X},
  author = {Xing-feng WANG and Yun-jia WANG and Tai HUANG},
  keywords = {remote sensing},
  keywords = {mining subsidence land},
  keywords = {domain knowledge},
  keywords = {Luan mining area},
  keywords = {\{GIS\} },
  abstract = {Extracting mining subsidence land from \{RS\} images is one of the important research topics for environmental monitoring in mining areas. The accuracy of traditional extraction models based on spectral features is low. In order to extract subsidence land from \{RS\} images with high accuracy, domain knowledge should be imported and new models should be proposed. To address the disadvantages of traditional extraction models, this paper imports domain knowledge from practice and experience, converts semantic knowledge into digital information, and proposes a new model for this specific task. Selecting the Luan mining area as the study area, the new model is tested based on \{GIS\} and related knowledge. The result shows that the proposed method is more precise than traditional methods and can satisfy the demands of land subsidence monitoring in mining areas. }
}
@article{Yang20099709,
  title = {Automatic generation of semantically enriched web pages by a text mining approach },
  journal = {Expert Systems with Applications },
  volume = {36},
  number = {6},
  pages = {9709 - 9718},
  year = {2009},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2009.02.022},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417409001729},
  author = {Hsin-Chang Yang},
  keywords = {Metadata generation},
  keywords = {Semantic tagging},
  keywords = {Text mining},
  keywords = {Self-organizing map },
  abstract = {Nowadays most Web pages contain little structure or supporting information that can reveal their semantics or meaning. To enable automated processing of Web pages, semantic information such as metadata and tags regarding each page should be added to it. Several authoring tools have been developed to help users tackle this task. However, manual or semi-automatic authoring is implausible when we intend to annotate a large number of Web pages. In this work, we propose a method to automatically generate descriptive metadata and tags for a Web page. The idea is to apply the self-organizing map algorithm to cluster the Web pages and discover the relationships between these clusters. In the meantime, the themes of each cluster are also identified. We then use such relationships and themes to tag the Web pages and generate metadata for them. The results of experiments show that our method can generate semantically relevant metadata and tags for Web pages. }
}
@article{Chen2008581,
  title = {Machine learning techniques for business blog search and mining },
  journal = {Expert Systems with Applications },
  volume = {35},
  number = {3},
  pages = {581 - 590},
  year = {2008},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2007.07.015},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417407002709},
  author = {Yun Chen and Flora S. Tsai and Kap Luk Chan},
  keywords = {Latent semantic analysis},
  keywords = {Probabilistic latent semantic analysis},
  keywords = {Weblog},
  keywords = {Blog},
  keywords = {Data mining },
  abstract = {Weblogs, or blogs, have rapidly gained in popularity over the past few years. In particular, the growth of business blogs that are written by or provide commentary on businesses and companies opens up new opportunities for developing blog-specific search and mining techniques. In this paper, we propose probabilistic models for blog search and mining using two machine learning techniques, latent semantic analysis (LSA) and probabilistic latent semantic analysis (PLSA). We implement the models in our database of business blogs, BizBlogs07, with the aim of achieving higher precision and recall. The probabilistic model is able to segment the business blogs into separate topic areas, which is useful for keyword detection on the blogosphere. Various term-weighting schemes and factor values were also studied in detail, revealing interesting patterns in our database of business blogs. Our multi-functional business blog system is indeed found to be very different from existing blog search engines, as it aims to provide better relevance and precision of search. }
}
@article{Hu2014477,
  title = {Analysis of insidious fault activation and water inrush from the mining floor },
  journal = {International Journal of Mining Science and Technology },
  volume = {24},
  number = {4},
  pages = {477 - 483},
  year = {2014},
  note = {},
  issn = {2095-2686},
  doi = {http://dx.doi.org/10.1016/j.ijmst.2014.05.010},
  url = {http://www.sciencedirect.com/science/article/pii/S2095268614000834},
  author = {Xinyu Hu and Lianguo Wang and Yinlong Lu and Mei Yu},
  keywords = {Insidious fault},
  keywords = {Effective shear stress},
  keywords = {Stress intensity factor},
  keywords = {Fault activation},
  keywords = {Water inrush },
  abstract = {Based on the stress field distribution rule of the mining floor under abutment pressure, we have established a simplified mechanical model, which contains multiple factors relating to the activation and evolution of insidious water-conductive faults. The influence of normal and shear stresses on fault activation and the effective shear stress distribution in the fault plane was acquired under mining conditions. Using fracture mechanics theory to calculate the stress intensity factor of an insidious fault front, we have derived the criterion for main fault activation. Results indicate that during the whole working face advance, transpressions are exerted on fault planes twice successively, in opposite directions. In most cases, the second transpression is more likely to lead to fault activation. Activation is influenced by many factors, predominant among which are: burial depth of the insidious fault, friction angle of the fault plane, face advance direction and pore water pressure. Steep fault planes are more easily activated to induce a sustained water inrush in the face. }
}
@article{Olafsson20081429,
  title = {Operations research and data mining },
  journal = {European Journal of Operational Research },
  volume = {187},
  number = {3},
  pages = {1429 - 1448},
  year = {2008},
  note = {},
  issn = {0377-2217},
  doi = {http://dx.doi.org/10.1016/j.ejor.2006.09.023},
  url = {http://www.sciencedirect.com/science/article/pii/S037722170600854X},
  author = {Sigurdur Olafsson and Xiaonan Li and Shuning Wu},
  keywords = {Data mining},
  keywords = {Optimization},
  keywords = {Classification},
  keywords = {Clustering},
  keywords = {Mathematical programming},
  keywords = {Heuristics },
  abstract = {With the rapid growth of databases in many modern enterprises data mining has become an increasingly important approach for data analysis. The operations research community has contributed significantly to this field, especially through the formulation and solution of numerous data mining problems as optimization problems, and several operations research applications can also be addressed using data mining methods. This paper provides a survey of the intersection of operations research and data mining. The primary goals of the paper are to illustrate the range of interactions between the two fields, present some detailed examples of important research work, and provide comprehensive references to other important work in the area. The paper thus looks at both the different optimization methods that can be used for data mining, as well as the data mining process itself and how operations research methods can be used in almost every step of this process. Promising directions for future research are also identified throughout the paper. Finally, the paper looks at some applications related to the area of management of electronic services, namely customer relationship management and personalization. }
}
@article{Chen20094075,
  title = {Using neural networks and data mining techniques for the financial distress prediction model },
  journal = {Expert Systems with Applications },
  volume = {36},
  number = {2, Part 2},
  pages = {4075 - 4086},
  year = {2009},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2008.03.020},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417408001954},
  author = {Wei-Sen Chen and Yin-Kuan Du},
  keywords = {Financial distress prediction model},
  keywords = {Artificial neural network},
  keywords = {Data mining },
  abstract = {The operating status of an enterprise is disclosed periodically in a financial statement. As a result, investors usually only get information about the financial distress a company may be in after the formal financial statement has been published. If company executives intentionally package financial statements with the purpose of hiding the actual status of the company, then investors will have even less chance of obtaining the real financial information. For example, a company can manipulate its current ratio by up to 200% so that its liquidity deficiency will not show up as financial distress in the short run. To improve the accuracy of the financial distress prediction model, this paper adopted the operating rules of the Taiwan stock exchange corporation (TSEC), which were violated by those companies that were subsequently stopped and suspended, as the scope of analysis of this research. In addition, this paper also used financial ratios, other non-financial ratios, and factor analysis to extract adaptable variables. Moreover, artificial neural network (ANN) and data mining (DM) techniques were used to construct the financial distress prediction model. The empirical experiment, with a total of 37 ratios and 68 listed companies as the initial samples, obtained a satisfactory result, which testifies to the feasibility and validity of our proposed methods for the financial distress prediction of listed companies. This paper makes four critical contributions: (1) The more factor analysis we used, the less accuracy we obtained with the \{ANN\} and \{DM\} approaches. (2) The closer we get to the actual occurrence of financial distress, the higher the accuracy we obtain, with an 82.14% correct percentage two seasons prior to the occurrence of financial distress. (3) Our empirical results show that factor analysis increases the error of classifying companies that are in a financial crisis as normal companies. (4) In developing a financial distress prediction model, the \{ANN\} approach obtains better prediction accuracy than the \{DM\} clustering approach. Therefore, this paper proposes that the artificial intelligence (AI) approach could be a more suitable methodology than traditional statistics for predicting the potential financial distress of a company. }
}
@article{Greco200874,
  title = {Mining taxonomies of process models },
  journal = {Data & Knowledge Engineering },
  volume = {67},
  number = {1},
  pages = {74 - 102},
  year = {2008},
  note = {},
  issn = {0169-023X},
  doi = {http://dx.doi.org/10.1016/j.datak.2008.06.010},
  url = {http://www.sciencedirect.com/science/article/pii/S0169023X08000839},
  author = {Gianluigi Greco and Antonella Guzzo and Luigi Pontieri},
  keywords = {Process mining},
  keywords = {Abstraction},
  keywords = {Knowledge discovery},
  keywords = {Workflow management },
  abstract = {Process mining techniques have been receiving great attention in the literature for their ability to automatically support process (re)design. Typically, these techniques discover a concrete workflow schema modelling all possible execution patterns registered in a given log, which can subsequently be exploited to support further-coming enactments. In this paper, an approach to process mining is introduced that extends classical discovery mechanisms by means of an abstraction method aimed at producing a taxonomy of workflow models. The taxonomy is built to capture the process behavior at different levels of detail. Indeed, the most-detailed mined models, i.e., the leaves of the taxonomy, are meant to support the design of concrete workflows, as happens with existing techniques in the literature. The other models, i.e., non-leaf nodes of the taxonomy, represent instead abstract views over the process behavior that can be used to support advanced monitoring and analysis tasks. All the techniques discussed in the paper have been implemented, tested, and made available as a plugin for a popular process mining framework (ProM). A series of tests, performed on different synthesized and real datasets, evidenced the capability of the approach to characterize the behavior encoded in input logs in a precise and complete way, achieving compelling conformance results even in the presence of complex behavior and noisy data. Moreover, encouraging results have been obtained in a real-life application scenario, where it is shown how the taxonomical view of the process can effectively support an explorative ex-post analysis, hinged on the different kinds of process execution discovered from the logs. }
}
@article{Chen20083214,
  title = {An information granulation based data mining approach for classifying imbalanced data },
  journal = {Information Sciences },
  volume = {178},
  number = {16},
  pages = {3214 - 3227},
  year = {2008},
  note = {Including Special Issue: Recent advances in granular computing Fifth Internation Conference on Machine Learning and Cybernetics },
  issn = {0020-0255},
  doi = {http://dx.doi.org/10.1016/j.ins.2008.03.018},
  url = {http://www.sciencedirect.com/science/article/pii/S0020025508001059},
  author = {Mu-Chen Chen and Long-Sheng Chen and Chun-Chin Hsu and Wei-Rong Zeng},
  keywords = {Information granulation},
  keywords = {Granular computing},
  keywords = {Data mining},
  keywords = {Latent semantic indexing},
  keywords = {Imbalanced data},
  keywords = {Feed-forward neural network },
  abstract = {Recently, the class imbalance problem has attracted much attention from researchers in the field of data mining. When learning from imbalanced data, in which most examples are labeled as one class and only few belong to another class, traditional data mining approaches do not have a good ability to predict the crucial minority instances. Unfortunately, many real world data sets like health examination, inspection, credit fraud detection, spam identification and text mining all are faced with this situation. In this study, we present a novel model called the “Information Granulation Based Data Mining Approach” to tackle this problem. The proposed methodology, which imitates the human ability to process information, acquires knowledge from Information Granules rather than from numerical data. This method also introduces a Latent Semantic Indexing based feature extraction tool using Singular Value Decomposition to dramatically reduce the data dimensions. In addition, several data sets from the \{UCI\} Machine Learning Repository are employed to demonstrate the effectiveness of our method. Experimental results show that our method can significantly increase the ability to classify imbalanced data. }
}
@article{Liu2008877,
  title = {Toward supporting real-time mining for data residing on enterprise systems },
  journal = {Expert Systems with Applications },
  volume = {34},
  number = {2},
  pages = {877 - 888},
  year = {2008},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2006.10.033},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417406003496},
  author = {Yu-Chin Liu and Ping-Yu Hsu},
  keywords = {Frequent patterns},
  keywords = {Data mining},
  keywords = {Enterprise databases },
  abstract = {As data mining techniques are explored extensively, incorporating discovered knowledge into business strategies gives corporations a superior competitive advantage. Most techniques for mining association rules nowadays are designed to solve problems based on transaction files transformed to a horizontal or vertical format. Namely, the transaction-normalized tables must be transformed before such methods can be applied, and previous works have pointed out that such data transformation tasks usually consume a lot of resources. As a result, data mining techniques have traditionally seldom been applied in real time. However, in many cases decisions have to be made in a short time; for example, decisions on promoting fresh agricultural goods in retail stores must be made daily, within one or two hours. This study therefore proposes a new method which incorporates mining algorithms with enterprise transaction databases directly to perform real-time mining. In addition, the proposed method has the following advantages for supporting real-time mining in enterprise systems: (1) raw data of enterprise systems are used directly; (2) when the threshold is tuned, only newly qualified data are read and the data structure built for the original data is kept intact; (3) product assortments centered on a particular product can be performed effectively; (4) the performance of the mining algorithm is better than that of popular mining algorithms. }
}
@article{Vandecruys2008823,
  title = {Mining software repositories for comprehensible software fault prediction models },
  journal = {Journal of Systems and Software },
  volume = {81},
  number = {5},
  pages = {823 - 839},
  year = {2008},
  note = {Software Process and Product Measurement },
  issn = {0164-1212},
  doi = {http://dx.doi.org/10.1016/j.jss.2007.07.034},
  url = {http://www.sciencedirect.com/science/article/pii/S0164121207001902},
  author = {Olivier Vandecruys and David Martens and Bart Baesens and Christophe Mues and Manu De Backer and Raf Haesen},
  keywords = {Classification},
  keywords = {Software mining},
  keywords = {Fault prediction},
  keywords = {Comprehensibility},
  keywords = {Ant Colony Optimization },
  abstract = {Software managers are routinely confronted with software projects that contain errors or inconsistencies and exceed budget and time limits. By mining software repositories with comprehensible data mining techniques, predictive models can be induced that offer software managers the insights they need to tackle these quality and budgeting problems in an efficient way. This paper deals with the role that the Ant Colony Optimization (ACO)-based classification technique AntMiner+ can play as a comprehensible data mining technique to predict erroneous software modules. In an empirical comparison on three real-world public datasets, the rule-based models produced by AntMiner+ are shown to achieve a predictive accuracy that is competitive with that of the models induced by several other included classification techniques, such as C4.5, logistic regression and support vector machines. In addition, we argue that the intuitiveness and comprehensibility of the AntMiner+ models can be considered superior to those of the latter models. }
}
@article{Exarchos2008467,
  title = {A two-stage methodology for sequence classification based on sequential pattern mining and optimization },
  journal = {Data & Knowledge Engineering },
  volume = {66},
  number = {3},
  pages = {467 - 487},
  year = {2008},
  note = {},
  issn = {0169-023X},
  doi = {http://dx.doi.org/10.1016/j.datak.2008.05.007},
  url = {http://www.sciencedirect.com/science/article/pii/S0169023X08000748},
  author = {Themis P. Exarchos and Markos G. Tsipouras and Costas Papaloukas and Dimitrios I. Fotiadis},
  keywords = {Sequential pattern mining},
  keywords = {Sequential pattern matching},
  keywords = {Sequence classification },
  abstract = {We present a methodology for sequence classification, which employs sequential pattern mining and optimization, in a two-stage process. In the first stage, a sequence classification model is defined, based on a set of sequential patterns and two sets of weights are introduced, one for the patterns and one for classes. In the second stage, an optimization technique is employed to estimate the weight values and achieve optimal classification accuracy. Extensive evaluation of the methodology is carried out, by varying the number of sequences, the number of patterns and the number of classes and it is compared with similar sequence classification approaches. }
}
@article{Ting2014200,
  title = {Mining logistics data to assure the quality in a sustainable food supply chain: A case in the red wine industry },
  journal = {International Journal of Production Economics },
  volume = {152},
  number = {0},
  pages = {200 - 209},
  year = {2014},
  note = {Sustainable Food Supply Chain Management },
  issn = {0925-5273},
  doi = {http://dx.doi.org/10.1016/j.ijpe.2013.12.010},
  url = {http://www.sciencedirect.com/science/article/pii/S0925527313005690},
  author = {S.L. Ting and Y.K. Tse and G.T.S. Ho and S.H. Chung and G. Pang},
  keywords = {Quality sustainability},
  keywords = {Supply chain quality},
  keywords = {Wine industry},
  keywords = {Association rule },
  abstract = {In recent years, food supply chains have faced increased quality risk, caused by the extended global supply chain and increased consumer demands on quality and safety. Given the concern regarding quality sustainability in the food supply chain, much attention is being paid to continuous planning and monitoring of quality assurance practices in the supply chain network. In this research, we propose a supply chain quality sustainability decision support system (QSDSS), adopting association rule mining and Dempster's rule of combination techniques. The aim of \{QSDSS\} is to support managers in food manufacturing firms to define good logistics plans in order to maintain the quality and safety of food products. We conduct a case study of a Hong Kong red wine company in order to illustrate the applicability and effectiveness of QSDSS. Implications of the proposed approach are discussed, and suggestions for future work are outlined. }
}
@article{Wang20095900,
  title = {Evolutionary-based feature selection approaches with new criteria for data mining: A case study of credit approval data },
  journal = {Expert Systems with Applications },
  volume = {36},
  number = {3, Part 2},
  pages = {5900 - 5908},
  year = {2009},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2008.07.026},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417408004454},
  author = {Chia-Ming Wang and Yin-Fu Huang},
  keywords = {Data Mining},
  keywords = {Evolutionary algorithm},
  keywords = {Feature selection},
  keywords = {Multi-objective optimization },
  abstract = {In this paper, the feature selection problem was formulated as a multi-objective optimization problem, and new criteria were proposed to fulfill the goal. First, data were pre-processed with a missing value replacement scheme, a re-sampling procedure, a data type transformation procedure, and a min-max normalization procedure. After that, a wide variety of classifiers and feature selection methods were applied and evaluated. Finally, the paper presents comprehensive experiments to show the relative performance of the classification tasks. The experimental results revealed the success of the proposed methods on credit approval data. In addition, the numeric results also provide guidance for selecting feature selection methods and classifiers in the knowledge discovery process. }
}
@article{Stankovski2008259,
  title = {Grid-enabling data mining applications with DataMiningGrid: An architectural perspective },
  journal = {Future Generation Computer Systems },
  volume = {24},
  number = {4},
  pages = {259 - 279},
  year = {2008},
  note = {},
  issn = {0167-739X},
  doi = {http://dx.doi.org/10.1016/j.future.2007.05.004},
  url = {http://www.sciencedirect.com/science/article/pii/S0167739X07000933},
  author = {Vlado Stankovski and Martin Swain and Valentin Kravtsov and Thomas Niessen and Dennis Wegener and Jörg Kindermann and Werner Dubitzky},
  keywords = {Data mining},
  keywords = {Distributed systems},
  keywords = {Web-based services },
  abstract = {The DataMiningGrid system has been designed to meet the requirements of modern and distributed data mining scenarios. Based on the Globus Toolkit and other open technology and standards, the DataMiningGrid system provides tools and services facilitating the grid-enabling of data mining applications without any intervention on the application side. Critical features of the system include flexibility, extensibility, scalability, efficiency, conceptual simplicity and ease of use. The system has been developed and evaluated on the basis of a diverse set of use cases from different sectors in science and technology. The DataMiningGrid software is freely available under Apache License 2.0. }
}
@article{Iqbal2008S42,
  title = {A novel approach of mining write-prints for authorship attribution in e-mail forensics },
  journal = {Digital Investigation },
  volume = {5, Supplement},
  number = {0},
  pages = {S42 - S51},
  year = {2008},
  note = {The Proceedings of the Eighth Annual \{DFRWS\} Conference },
  issn = {1742-2876},
  doi = {http://dx.doi.org/10.1016/j.diin.2008.05.001},
  url = {http://www.sciencedirect.com/science/article/pii/S1742287608000315},
  author = {Farkhund Iqbal and Rachid Hadjidj and Benjamin C.M. Fung and Mourad Debbabi},
  keywords = {E-mail forensic analysis},
  keywords = {Authorship identification},
  keywords = {Data mining},
  keywords = {Write-print},
  keywords = {Frequent itemsets },
  abstract = {There is an alarming increase in the number of cybercrime incidents through anonymous e-mails. The problem of e-mail authorship attribution is to identify the most plausible author of an anonymous e-mail from a group of potential suspects. Most previous contributions employed a traditional classification approach, such as decision trees and Support Vector Machines (SVM), to identify the author, and studied the effects of different writing style features on the classification accuracy. However, little attention has been given to ensuring the quality of the evidence. In this paper, we introduce an innovative data mining method to capture the write-print of every suspect and model it as combinations of features that occur frequently in the suspect's e-mails. This notion is called a frequent pattern, which has proven effective in many data mining applications, but this is the first time it has been applied to the problem of authorship attribution. Unlike the traditional approach, the write-print extracted by our method is unique among the suspects and, therefore, provides convincing and credible evidence for presentation in a court of law. Experiments on real-life e-mails suggest that the proposed method can effectively identify the author, and the results are supported by strong evidence. }
}
@article{Lopes2007316,
  title = {Visual text mining using association rules },
  journal = {Computers & Graphics },
  volume = {31},
  number = {3},
  pages = {316 - 326},
  year = {2007},
  note = {},
  issn = {0097-8493},
  doi = {http://dx.doi.org/10.1016/j.cag.2007.01.023},
  url = {http://www.sciencedirect.com/science/article/pii/S0097849307000544},
  author = {A.A. Lopes and R. Pinho and F.V. Paulovich and R. Minghim},
  keywords = {Visual text mining},
  keywords = {Association rules},
  keywords = {Data mining},
  keywords = {Information visualization },
  abstract = {In many situations, individuals or groups of individuals are faced with the need to examine sets of documents to achieve an understanding of their structure and to locate relevant information. In that context, this paper presents a framework for visual text mining to support exploration of both the general structure and the relevant topics within a textual document collection. Our approach starts by building a visualization from the text data set. On top of that, a novel technique is presented that generates and filters association rules to detect and display topics from a group of documents. Results have shown a very consistent match between topics extracted using this approach and those actually present in the data set. }
}
@article{Holton2009853,
  title = {Identifying disgruntled employee systems fraud risk through text mining: A simple solution for a multi-billion dollar problem },
  journal = {Decision Support Systems },
  volume = {46},
  number = {4},
  pages = {853 - 864},
  year = {2009},
  note = {\{IT\} Decisions in Organizations },
  issn = {0167-9236},
  doi = {http://dx.doi.org/10.1016/j.dss.2008.11.013},
  url = {http://www.sciencedirect.com/science/article/pii/S0167923608002078},
  author = {Carolyn Holton},
  keywords = {\{IS\} security},
  keywords = {Occupational fraud},
  keywords = {Text mining},
  keywords = {Design science},
  keywords = {Disgruntled employee},
  keywords = {Organizational communication },
  abstract = {Occupational fraud is a $652 billion problem to which disgruntled employees are a major contributor. Much security research addresses reducing fraud opportunity and increasing fraud detection, but detecting motivational factors like employee disgruntlement is less studied. The Sarbanes–Oxley Act requires that companies archive email, creating an untapped resource for deterring fraud. Herein, protocols to identify disgruntled communications are developed. Messages cluster well according to disgruntled content, giving confidence in the value of email for this task. A highly accurate naïve Bayes model predicts whether messages contain disgruntled communications, providing extremely relevant information not otherwise likely to be revealed in a fraud audit. The model can be incorporated into fraud risk analysis systems to improve their ability to detect and deter fraud. }
}
@article{Zhang20081326,
  title = {Mining search engine query logs for social filtering-based query recommendation },
  journal = {Applied Soft Computing },
  volume = {8},
  number = {4},
  pages = {1326 - 1334},
  year = {2008},
  note = {Soft Computing for Dynamic Data Mining },
  issn = {1568-4946},
  doi = {http://dx.doi.org/10.1016/j.asoc.2007.11.004},
  url = {http://www.sciencedirect.com/science/article/pii/S1568494608000458},
  author = {Zhiyong Zhang and Olfa Nasraoui},
  keywords = {Query log},
  keywords = {Social filtering},
  keywords = {Web mining},
  keywords = {Recommendation },
  abstract = {This paper presents a simple and intuitive method for mining search engine query logs for fast social filtering, where searchers are provided with dynamic query recommendations on a large-scale industrial-strength search engine. We adopt a dynamic approach that is able to absorb new and recent trends in web usage on search engines, while forgetting outdated trends, thus adapting to dynamic changes in web users’ interests. In order to get well-rounded recommendations, we combine two methods: first, we model search engine users’ sequential search behavior, and interpret this consecutive search behavior as client-side query refinement, which should form the basis for the search engine’s own query refinement process. This query refinement process is exploited to learn useful information that helps generate related queries. Second, we combine this method with a traditional text or content based similarity method to compensate for the shortness of query sessions and the sparsity of real query log data. }
}
@article{vanderAalst2004231,
  title = {Process mining: a research agenda },
  journal = {Computers in Industry },
  volume = {53},
  number = {3},
  pages = {231 - 244},
  year = {2004},
  note = {Process / Workflow Mining },
  issn = {0166-3615},
  doi = {http://dx.doi.org/10.1016/j.compind.2003.10.001},
  url = {http://www.sciencedirect.com/science/article/pii/S0166361503001945},
  author = {W.M.P. van der Aalst and A.J.M.M. Weijters},
  keywords = {Process mining},
  keywords = {Workflow mining},
  keywords = {Workflow management},
  keywords = {Data mining},
  keywords = {Petri nets },
  abstract = {Enterprise information systems support and control operational business processes ranging from simple internal back-office processes to complex interorganizational processes. Technologies such as workflow management (WFM), enterprise application integration (EAI), enterprise resource planning (ERP), and web services (WS) typically focus on the realization of \{IT\} support rather than monitoring the operational business processes. Process mining aims at extracting information from event logs to capture the business process as it is being executed. In this paper, we put the topic of process mining into context, discuss the main issues around process mining, and finally we introduce the papers in this special issue. }
}
@article{Congiusta20083,
  title = {Service-oriented middleware for distributed data mining on the grid },
  journal = {Journal of Parallel and Distributed Computing },
  volume = {68},
  number = {1},
  pages = {3 - 15},
  year = {2008},
  note = {Parallel Techniques for Information Extraction },
  issn = {0743-7315},
  doi = {http://dx.doi.org/10.1016/j.jpdc.2007.07.007},
  url = {http://www.sciencedirect.com/science/article/pii/S0743731507001438},
  author = {Antonio Congiusta and Domenico Talia and Paolo Trunfio},
  keywords = {Distributed data mining},
  keywords = {Grid computing},
  keywords = {Grid services},
  keywords = {\{WSRF\} },
  abstract = {Distribution of data and computation allows for solving larger problems and executing applications that are distributed in nature. The grid is a distributed computing infrastructure that enables coordinated resource sharing within dynamic organizations consisting of individuals, institutions, and resources. The grid extends the distributed and parallel computing paradigms, allowing for resource negotiation and dynamic allocation, heterogeneity, open protocols, and services. Grid environments can be used both for compute-intensive tasks and data-intensive applications by exploiting their resources, services, and data access mechanisms. Data mining algorithms and knowledge discovery processes are both compute and data intensive; therefore, the grid can offer a computing and data management infrastructure for supporting decentralized and parallel data analysis. This paper discusses how grid computing can be used to support distributed data mining. Research activities in grid-based data mining and some challenges in this area are presented, along with some promising future directions for developing grid-based distributed data mining. }
}
@article{Romero2007135,
  title = {Educational data mining: A survey from 1995 to 2005 },
  journal = {Expert Systems with Applications },
  volume = {33},
  number = {1},
  pages = {135 - 146},
  year = {2007},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2006.04.005},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417406001266},
  author = {C. Romero and S. Ventura},
  keywords = {Data mining},
  keywords = {Educational systems},
  keywords = {Web mining},
  keywords = {Web-based educational systems },
  abstract = {Currently there is an increasing interest in data mining and educational systems, making educational data mining a new, growing research community. This paper surveys the application of data mining to traditional educational systems, in particular web-based courses, well-known learning content management systems, and adaptive and intelligent web-based educational systems. Each of these systems has different data sources and objectives for knowledge discovery. After preprocessing the available data in each case, data mining techniques can be applied: statistics and visualization; clustering, classification and outlier detection; association rule mining and pattern mining; and text mining. Beyond this plentiful work, much more specialized work is needed for educational data mining to become a mature area. }
}
@article{Lin20141,
  title = {Mining \{GPS\} data for mobility patterns: A survey },
  journal = {Pervasive and Mobile Computing },
  volume = {12},
  number = {0},
  pages = {1 - 16},
  year = {2014},
  note = {},
  issn = {1574-1192},
  doi = {http://dx.doi.org/10.1016/j.pmcj.2013.06.005},
  url = {http://www.sciencedirect.com/science/article/pii/S1574119213000825},
  author = {Miao Lin and Wen-Jing Hsu},
  keywords = {Mobility pattern},
  keywords = {\{GPS\} data},
  keywords = {Ubiquitous computing},
  keywords = {Mobile computing },
  abstract = {With the help of various positioning tools, individuals’ mobility behaviors are being continuously captured from mobile phones, wireless networking devices and \{GPS\} appliances. These mobility data serve as an important foundation for understanding individuals’ mobility behaviors. For instance, recent studies show that, despite the dissimilarity in the mobility areas covered by individuals, there is high regularity in human mobility behaviors, suggesting that most individuals follow a simple and reproducible pattern. This survey paper reviews relevant results on uncovering mobility patterns from \{GPS\} datasets. Specifically, it covers results on inferring locations of significance for prediction of future moves, detecting modes of transport, mining trajectory patterns and recognizing location-based activities. The survey provides a general perspective for studies on the issues of individuals’ mobility by reviewing the methods and algorithms in detail and comparing the existing results on the same issues. Several new and emergent issues concerning individuals’ mobility are proposed for further research. }
}
@article{Azadeh20082165,
  title = {Improved estimation of electricity demand function by integration of fuzzy system and data mining approach },
  journal = {Energy Conversion and Management },
  volume = {49},
  number = {8},
  pages = {2165 - 2177},
  year = {2008},
  note = {},
  issn = {0196-8904},
  doi = {http://dx.doi.org/10.1016/j.enconman.2008.02.021},
  url = {http://www.sciencedirect.com/science/article/pii/S0196890408000745},
  author = {A. Azadeh and M. Saberi and S.F. Ghaderi and A. Gitiforouz and V. Ebrahimipour},
  keywords = {Fuzzy system},
  keywords = {Data mining},
  keywords = {Forecasting},
  keywords = {Preprocessing},
  keywords = {Time series},
  keywords = {Electricity consumption },
  abstract = {This study presents an integrated fuzzy system, data mining and time series framework to estimate and predict electricity demand for seasonal and monthly changes in electricity consumption, especially in developing countries such as China and Iran with non-stationary data. Furthermore, it is difficult to model the uncertain behavior of energy consumption with only a conventional fuzzy system or time series, and the integrated algorithm could be an ideal substitute in such cases. To construct a fuzzy system, a rule base is needed. Because a rule base is not available for the demand function, a look-up table, which is one of the rule extraction methods, is used to extract the rule base. This system is defined as FLT. Also, the decision tree method, which is a data mining approach, is similarly utilized to extract the rule base. This system is defined as FDM. The preferred time series model is selected from linear (ARMA) and nonlinear models. For this, after selecting the preferred \{ARMA\} model, the McLeod–Li test is applied to determine whether the nonlinearity condition holds. When the nonlinearity condition is satisfied, the preferred nonlinear model is selected and compared with the preferred \{ARMA\} model, and one of them is finally chosen as the time series model. At last, \{ANOVA\} is used for selecting the preferred model from the fuzzy models and the time series model. Also, the impact of data preprocessing and postprocessing on the fuzzy system performance is considered by the algorithm. In addition, another unique feature of the proposed algorithm is the utilization of the autocorrelation function (ACF) to define input variables, whereas conventional methods use trial and error. Monthly electricity consumption of Iran from 1995 to 2005 is considered as the case of this study. The \{MAPE\} estimation of the genetic algorithm (GA) and artificial neural network (ANN) versus the proposed algorithm shows the appropriateness of the proposed algorithm. }
}
@article{Kucukyilmaz20081448,
  title = {Chat mining: Predicting user and message attributes in computer-mediated communication },
  journal = {Information Processing & Management },
  volume = {44},
  number = {4},
  pages = {1448 - 1466},
  year = {2008},
  note = {},
  issn = {0306-4573},
  doi = {http://dx.doi.org/10.1016/j.ipm.2007.12.009},
  url = {http://www.sciencedirect.com/science/article/pii/S0306457308000046},
  author = {Tayfun Kucukyilmaz and B. Barla Cambazoglu and Cevdet Aykanat and Fazli Can},
  keywords = {Authorship analysis},
  keywords = {Chat mining},
  keywords = {Computer-mediated communication},
  keywords = {Machine learning},
  keywords = {Stylistics},
  keywords = {Text classification },
  abstract = {The focus of this paper is to investigate the possibility of predicting several user and message attributes in text-based, real-time, online messaging services. For this purpose, a large collection of chat messages is examined. The applicability of various supervised classification techniques for extracting information from the chat messages is evaluated. Two competing models are used for defining the chat mining problem. A term-based approach is used to investigate the user and message attributes in the context of vocabulary use while a style-based approach is used to examine the chat messages according to the variations in the authors’ writing styles. Among 100 authors, the identity of an author is correctly predicted with 99.7% accuracy. Moreover, the reverse problem is also investigated, and the effect of author attributes on computer-mediated communications is discussed. }
}
@article{Hsia2008596,
  title = {Course planning of extension education to meet market demand by using data mining techniques – an example of Chinkuo technology university in Taiwan },
  journal = {Expert Systems with Applications },
  volume = {34},
  number = {1},
  pages = {596 - 602},
  year = {2008},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2006.09.025},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417406003095},
  author = {Tai-Chang Hsia and An-Jin Shie and Li-Chen Chen},
  keywords = {Data mining},
  keywords = {Extension education},
  keywords = {Decision tree algorithm},
  keywords = {Link analysis algorithms},
  keywords = {Decision forest algorithm },
  abstract = {This study used data mining techniques to analyze the course preferences and course completion rates of enrollees in extension education courses at a university in Taiwan. First, extension courses were classified into five broad groups. Records of enrollees in extension courses from 2000 to 2005 were then analyzed by three data mining algorithms: Decision Tree, Link Analysis, and Decision Forest. Decision Tree was used to find enrollee course preferences, Link Analysis found the correlation between course category and enrollee profession, and Decision Forest found the probability of enrollees completing preferred courses. The results will be used as a reference for curriculum development in the extension program. }
}
@article{Yun200786,
  title = {Mining lossless closed frequent patterns with weight constraints },
  journal = {Knowledge-Based Systems },
  volume = {20},
  number = {1},
  pages = {86 - 97},
  year = {2007},
  note = {},
  issn = {0950-7051},
  doi = {http://dx.doi.org/10.1016/j.knosys.2006.07.007},
  url = {http://www.sciencedirect.com/science/article/pii/S0950705106001262},
  author = {Unil Yun},
  keywords = {Knowledge extraction},
  keywords = {Data mining},
  keywords = {Weighted frequent pattern mining},
  keywords = {Closed pattern mining },
  abstract = {Frequent pattern mining is one of the main concerns in data mining tasks. In frequent pattern mining, closed frequent pattern mining and weighted frequent pattern mining are two main approaches to reducing the search space. Although many related studies have been suggested, no mining algorithm considers both paradigms. Even though closed frequent pattern mining represents exactly the same knowledge and weighted frequent pattern mining provides a way to discover more important patterns, a direct incorporation of closed frequent pattern mining and weighted frequent pattern mining may lose information. Based on our analysis of joining orders, we propose closed weighted frequent pattern mining, and present how to discover succinct but lossless closed frequent patterns with weight constraints. To our knowledge, ours is the first work specifically to consider both constraints. An extensive performance study shows that our algorithm outperforms previous algorithms. In addition, it is efficient and scalable. }
}
@article{Mutihac20081,
  title = {Mining in chemometrics },
  journal = {Analytica Chimica Acta },
  volume = {612},
  number = {1},
  pages = {1 - 18},
  year = {2008},
  note = {},
  issn = {0003-2670},
  doi = {http://dx.doi.org/10.1016/j.aca.2008.02.025},
  url = {http://www.sciencedirect.com/science/article/pii/S0003267008003450},
  author = {Lucia Mutihac and Radu Mutihac},
  keywords = {Chemometrics},
  keywords = {Data mining},
  keywords = {Exploratory analysis},
  keywords = {Pattern recognition},
  keywords = {Artificial neural networks},
  keywords = {Inferential statistics},
  keywords = {Hypothesis-driven methods},
  keywords = {Data-driven methods },
  abstract = {Some of the increasingly widespread data mining methods in chemometrics, such as exploratory data analysis, artificial neural networks, pattern recognition, and digital image processing, are discussed along with their highs and lows and some of their representative applications. The development of more complex analytical instruments and the need to cope with larger experimental data sets have demanded new approaches in data analysis, which have led to advanced methods in experimental design and data processing. Hypothesis-driven methods typified by inferential statistics have been gradually complemented or even replaced by data-driven, model-free methods that seek structure in data without reference to the experimental protocol or prior hypotheses. The emphasis is put on the ability of data mining methods to solve multivariate–multiresponse problems on the basis of experimental data and minimal statistical assumptions only, in contrast to classical methods, which require predefined priors to be tested against some null hypothesis. }
}
@article{Wang20081707,
  title = {Mining knowledge from natural language texts using fuzzy associated concept mapping },
  journal = {Information Processing & Management },
  volume = {44},
  number = {5},
  pages = {1707 - 1719},
  year = {2008},
  note = {},
  issn = {0306-4573},
  doi = {http://dx.doi.org/10.1016/j.ipm.2008.05.002},
  url = {http://www.sciencedirect.com/science/article/pii/S0306457308000526},
  author = {W.M. Wang and C.F. Cheung and W.B. Lee and S.K. Kwok},
  keywords = {Concept mapping},
  keywords = {Natural language processing},
  keywords = {Knowledge mining},
  keywords = {Information retrieval},
  keywords = {Fuzzy set theory},
  keywords = {Knowledge management },
  abstract = {Natural Language Processing (NLP) techniques have been successfully used to automatically extract information from unstructured text through a detailed analysis of its content, often to satisfy particular information needs. In this paper, an automatic concept map construction technique, Fuzzy Association Concept Mapping (FACM), is proposed for the conversion of abstracted short texts into concept maps. The approach consists of a linguistic module and a recommendation module. The linguistic module is a text mining method that does not require the user to have any prior knowledge of \{NLP\} techniques. It incorporates rule-based reasoning (RBR) and case-based reasoning (CBR) for anaphora resolution. It aims at extracting the propositions in text so as to construct a concept map automatically. The recommendation module is arrived at by adopting fuzzy set theories. It is an interactive process which provides suggestions of propositions for further human refinement of the automatically generated concept maps. The suggested propositions are relationships among the concepts which are not explicitly found in the paragraphs. This technique helps to stimulate individual reflection and generate new knowledge. Evaluation was carried out by using the Science Citation Index (SCI) abstract database and \{CNET\} News as test data, which are well-known databases whose text quality is assured. Experimental results show that the automatically generated concept maps conform to the outputs generated manually by domain experts, since the degree of difference between them is proportionally small. The method provides users with the ability to convert scientific and short texts into a structured format which can be easily processed by computer. Moreover, it provides knowledge workers with extra time to re-think their written text and to view their knowledge from another angle. }
}
@incollection{Gibert2008205,
  title = {Chapter Twelve Data Mining for Environmental Systems },
  editor = {A.J. Jakeman and A.A. Voinov and A.E. Rizzoli and S.H. Chen},
  booktitle = {Environmental Modelling, Software and Decision Support},
  publisher = {Elsevier},
  year = {2008},
  volume = {3},
  pages = {205 - 228},
  series = {Developments in Integrated Environmental Assessment },
  issn = {1574-101X},
  doi = {http://dx.doi.org/10.1016/S1574-101X(08)00612-1},
  url = {http://www.sciencedirect.com/science/article/pii/S1574101X08006121},
  author = {K. Gibert and J. Spate and M. Sànchez-Marrè and Ioannis N. Athanasiadis and J. Comas},
  keywords = {data mining},
  keywords = {knowledge discovery of data},
  keywords = {multidisciplinarity},
  keywords = {environmental systems },
  abstract = {Over recent years a huge library of data mining algorithms has been developed to tackle a variety of problems in fields such as medical imaging and network traffic analysis. Many of these techniques are far more flexible than more classical modelling approaches and could be usefully applied to data-rich environmental problems. Certain techniques such as Artificial Neural Networks, Clustering, Case-Based Reasoning and more recently Bayesian Decision Networks have found application in environmental modelling while other methods, for example classification and association rule extraction, have not yet been taken up on any wide scale. We propose that these and other data mining techniques could be usefully applied to difficult problems in the field. The chapter is a general introduction to Data Mining techniques for Environmental Scientists who may be interested in using them in their applications. So the presentation focuses on the contributions of data mining techniques to environmental applications and on general guidelines of good practice in real world domains. The purpose of this chapter is not to present technical details on specific data mining techniques, but rather to provide general guidance to non-expert users to help them decide which technique is appropriate for solving their problem. References to the wider literature are provided. }
}
@article{Brooks20081,
  title = {Shifting the focus of strategic occupational injury prevention: Mining free-text, workers compensation claims data },
  journal = {Safety Science },
  volume = {46},
  number = {1},
  pages = {1 - 21},
  year = {2008},
  note = {},
  issn = {0925-7535},
  doi = {http://dx.doi.org/10.1016/j.ssci.2006.09.006},
  url = {http://www.sciencedirect.com/science/article/pii/S0925753506001299},
  author = {Benjamin Brooks},
  keywords = {Text-mining},
  keywords = {Workers compensation},
  keywords = {Car and delivery drivers},
  keywords = {Wood industry},
  keywords = {Injury prevention },
  abstract = {The current analysis applies the software SAS® Text Miner to the mining of the free-text component of workers compensation claims data. It describes these methods with reference to claims within the state of Victoria, Australia from 1992 to 2002 for two groups called ‘Car and Delivery Drivers’ and ‘Wood Industry Workers’. It suggests that text mining can be used as a stand-alone tool for the analysis of free-text descriptions of occupational injury or combined with coded data to investigate sub-categories of injury. By manipulating the original description of the injury, text-mining can offer more detailed analyses of certain types of claim and shifts the focus of strategic prevention from broad statistics or coded groups of information to the descriptions people use to explain their injuries and accidents. }
}
@article{Bauer2008415,
  title = {Mining data, gathering variables and recombining information: the flexible architecture of epidemiological studies },
  journal = {Studies in History and Philosophy of Science Part C: Studies in History and Philosophy of Biological and Biomedical Sciences },
  volume = {39},
  number = {4},
  pages = {415 - 428},
  year = {2008},
  note = {},
  issn = {1369-8486},
  doi = {http://dx.doi.org/10.1016/j.shpsc.2008.09.008},
  url = {http://www.sciencedirect.com/science/article/pii/S1369848608000721},
  author = {Susanne Bauer},
  keywords = {Risk factor epidemiology},
  keywords = {Breast cancer aetiology},
  keywords = {Collecting},
  keywords = {Data mining},
  keywords = {Biobanks},
  keywords = {Population registries },
  abstract = {Since the second half of the twentieth century, biomedical research has made increasing use of epidemiological methods to establish empirical evidence on a population level. This paper is about practices with data in epidemiological research, based on a case study in Denmark. I propose an epistemology of record linkage that invites exploration of epidemiological studies as heterogeneous assemblages. Focusing on data collecting, sampling and linkage, I examine how data organisation and processing become productive beyond the context of their collection. The case study looks at how a local population database established in 1976 to investigate possibilities for the prevention of cardiovascular disease is used thirty years later to test hypotheses on the aetiology of breast cancer. For two breast cancer investigations based on the same core data set, I follow the underlying record linkage practice and describe how research objects such as molecular markers become relevant with respect to public health through information networking. Epidemiological association studies function as tools that performatively enrol different contexts into statistical risk estimation, thereby configuring options for research as well as for clinical testing and public health policy. }
}
@article{Kuo2007794,
  title = {Mining association rules through integration of clustering analysis and ant colony system for health insurance database in Taiwan },
  journal = {Expert Systems with Applications },
  volume = {33},
  number = {3},
  pages = {794 - 808},
  year = {2007},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2006.08.035},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417406002387},
  author = {R.J. Kuo and S.Y. Lin and C.W. Shih},
  keywords = {Data mining},
  keywords = {Ant colony system},
  keywords = {Cluster},
  keywords = {Association rule },
  abstract = {In addition to sharing and applying knowledge in the community, knowledge discovery has become an important issue in the knowledge economy era. Data mining plays an important role in knowledge discovery. Therefore, this study proposes a novel data mining framework which first clusters the data and then mines association rules for each cluster. The first stage employs the ant system-based clustering algorithm (ASCA) and ant K-means (AK) to cluster the database, while the ant colony system-based association rules mining algorithm is applied to discover the useful rules for each group. The medical database provided by the National Health Insurance Bureau of the Taiwan Government is used to verify the proposed method. The evaluation results showed that the proposed method not only is able to extract the rules much faster, but also can discover more important rules. }
}
@article{Liao200850,
  title = {Mining product maps for new product development },
  journal = {Expert Systems with Applications },
  volume = {34},
  number = {1},
  pages = {50 - 62},
  year = {2008},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2006.08.027},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417406002545},
  author = {Shu-Hsien Liao and Chia-Lin Hsieh and Sui-Ping Huang},
  keywords = {New product development},
  keywords = {Product map},
  keywords = {Data mining},
  keywords = {Association rules},
  keywords = {Knowledge extraction },
  abstract = {Many enterprises have been devoting a significant portion of their budget to new product development (NPD) in order to distinguish their products from those of their competitors, and to make them better fit the needs and wants of customers. Hence, businesses should develop products that fulfill customer demands, since this increases the enterprise’s competitiveness and is an essential criterion for earning higher loyalty and profits. This paper presents the product map obtained from data mining results, which investigates the relationships among customer demands, product characteristics, and transaction records, using the Apriori algorithm as a methodology of association rules for data mining. The product map shows that different knowledge patterns and rules can be extracted from customers to develop new cosmetic products and possible marketing solutions. Accordingly, this paper suggests that the cosmetics industry should extract customer knowledge from the demand side and use this as a knowledge resource on its supply chain for new product development. }
}
@article{Novák20084,
  title = {Mining pure linguistic associations from numerical data },
  journal = {International Journal of Approximate Reasoning },
  volume = {48},
  number = {1},
  pages = {4 - 22},
  year = {2008},
  note = {Special Section: Perception Based Data Mining and Decision Support Systems },
  issn = {0888-613X},
  doi = {http://dx.doi.org/10.1016/j.ijar.2007.06.005},
  url = {http://www.sciencedirect.com/science/article/pii/S0888613X0700076X},
  author = {Vilém Novák and Irina Perfilieva and Antonín Dvořák and Guoqing Chen and Qiang Wei and Peng Yan},
  keywords = {Evaluative linguistic expressions},
  keywords = {\{GUHA\} method},
  keywords = {Linguistic associations},
  keywords = {Data mining},
  keywords = {Association rules },
  abstract = {This paper presents a method for the direct search of associations from numerical data that are expressed in natural language; we therefore call them “linguistic associations”. The associations are composed of evaluative linguistic expressions, for example “small, very big, roughly medium”, etc. The main idea is to evaluate real-valued data by the corresponding linguistic expressions and then search for associations using some of the standard data-mining techniques (we have used the \{GUHA\} method). One of the essential outcomes of our theory is the high understandability of the found associations: when formulated in natural language, they are much closer to the way of thinking of experts from various fields. Moreover, associations characterizing real dependencies can be directly taken as fuzzy IF–THEN rules and used as expert knowledge about the problem. }
}
@article{Gobster201421,
  title = {(Text) Mining the LANDscape: Themes and trends over 40 years of Landscape and Urban Planning },
  journal = {Landscape and Urban Planning },
  volume = {126},
  number = {0},
  pages = {21 - 30},
  year = {2014},
  note = {},
  issn = {0169-2046},
  doi = {http://dx.doi.org/10.1016/j.landurbplan.2014.02.025},
  url = {http://www.sciencedirect.com/science/article/pii/S0169204614000668},
  author = {Paul H. Gobster},
  keywords = {Text analysis},
  keywords = {\{VOSviewer\}},
  keywords = {Cluster analysis},
  keywords = {Visualization},
  keywords = {Research trends},
  keywords = {Knowledge paradigms },
  abstract = {In commemoration of the journal's 40th anniversary, the co-editor explores themes and trends covered by Landscape and Urban Planning and its parent journals through a qualitative comparison of co-occurrence term maps generated from the text corpora of its abstracts across the four decadal periods of publication. Cluster maps generated from the \{VOSviewer\} program reveal a coalescence of concepts for the last two decades along three knowledge domains: human dimensions, landscape analysis and planning, and urban ecology. Citation impact “heat maps” offer additional clues about emerging and high-impact topics. The editor assesses these findings with respect to the journal's aims and scope and offers some thoughts on future directions for research. }
}
@article{Wang2007425,
  title = {Mining key information of web pages: A method and its application },
  journal = {Expert Systems with Applications },
  volume = {33},
  number = {2},
  pages = {425 - 433},
  year = {2007},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2006.05.017},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417406001588},
  author = {Chao Wang and Jie Lu and Guangquan Zhang},
  keywords = {Web content mining},
  keywords = {Web page},
  keywords = {Key information},
  keywords = {Entropy},
  keywords = {Taxonomy},
  keywords = {Ontology generation },
  abstract = {Web content mining aims to discover useful information and generate desired knowledge from a large amount of web pages. Key information, such as distinctive menu items, navigation indicators, which is embedded in web pages, can help classify the main contents of web pages and reflect certain taxonomy knowledge. Therefore, mining key information is significant in helping acquire domain knowledge and build catalogue classifiers. Current web content mining methods cannot mine such key information effectively. “Noise information” (such as advertisements) is a problem for the performance of web mining tasks. This paper proposes a method to extract key information out of web pages which contain noisy information. The method contains two steps: to extract a list of candidate key information, and then apply entropy measure to filter noisy information and discover key information. Experiment results show that this method is effective in discovering key information. With the discovered key information that reflects taxonomy knowledge, an application is developed to help ontology generation. }
}
@article{DueñasFernández2014129,
  title = {Detecting trends on the Web: A multidisciplinary approach },
  journal = {Information Fusion },
  volume = {20},
  number = {0},
  pages = {129 - 135},
  year = {2014},
  note = {},
  issn = {1566-2535},
  doi = {http://dx.doi.org/10.1016/j.inffus.2014.01.006},
  url = {http://www.sciencedirect.com/science/article/pii/S1566253514000116},
  author = {Rodrigo Dueñas-Fernández and Juan D. Velásquez and Gaston L’Huillier},
  keywords = {Trend detection},
  keywords = {Web opinion mining},
  keywords = {Topic modeling },
  abstract = {This paper introduces a framework for trend modeling and detection on the Web through the usage of Opinion Mining and Topic Modeling tools based on the fusion of freely available information. This framework consists of a four-step model that runs periodically: crawl a set of predefined sources of documents; search for potential sources and extract topics from the retrieved documents; retrieve opinionated documents from social networks for each detected topic; and extract sentiment information from them. The proposed framework was applied to a set of 20 sources of documents over a period of 8 months. After the analysis period, once the proposed experiments had been run, an F-measure of 0.56 was obtained for the detection of significant events, implying that the proposed framework is a feasible model of how trends can be represented through the analysis of documents freely available on the Web. }
}
@article{vanderAalst2007191,
  title = {Exploring the \{CSCW\} spectrum using process mining },
  journal = {Advanced Engineering Informatics },
  volume = {21},
  number = {2},
  pages = {191 - 199},
  year = {2007},
  note = {Ontology of Systems and Software Engineering; Techniques to Support Collaborative Engineering Environments },
  issn = {1474-0346},
  doi = {http://dx.doi.org/10.1016/j.aei.2006.05.002},
  url = {http://www.sciencedirect.com/science/article/pii/S1474034606000322},
  author = {Wil M.P. van der Aalst},
  keywords = {Process mining},
  keywords = {Business activity monitoring},
  keywords = {Business process intelligence},
  keywords = {\{CSCW\}},
  keywords = {Data mining },
  abstract = {Process mining techniques allow for extracting information from event logs. For example, the audit trails of a workflow management system or the transaction logs of an enterprise resource planning system can be used to discover models describing processes, organizations, and products. Traditionally, process mining has been applied to structured processes. In this paper, we argue that process mining can also be applied to less structured processes supported by computer supported cooperative work (CSCW) systems. In addition, the ProM framework is described. Using ProM, a wide variety of process mining activities is supported, ranging from process discovery and verification to conformance checking and social network analysis. }
}
@article{Adhikari20072312,
  title = {Enhancing quality of knowledge synthesized from multi-database mining },
  journal = {Pattern Recognition Letters },
  volume = {28},
  number = {16},
  pages = {2312 - 2324},
  year = {2007},
  note = {},
  issn = {0167-8655},
  doi = {http://dx.doi.org/10.1016/j.patrec.2007.07.017},
  url = {http://www.sciencedirect.com/science/article/pii/S0167865507002371},
  author = {Animesh Adhikari and P.R. Rao},
  keywords = {\{ACP\} coding},
  keywords = {Coding association rules},
  keywords = {Multi-database mining},
  keywords = {Space efficient representation of local association rules},
  keywords = {Quality of synthesized knowledge },
  abstract = {Multi-database mining using local pattern analysis could be considered an approximate method of mining multiple large databases. Thus, it might be required to enhance the quality of knowledge synthesized from multiple databases. Also, many decision-making applications are directly based on the available local patterns in different databases. The quality of synthesized knowledge/decisions based on local patterns in different databases could be enhanced by incorporating more local patterns in the knowledge synthesizing/processing activities. Thus, the available local patterns play a crucial role in building efficient multi-database mining applications. We represent patterns in condensed form by employing a coding called \{ACP\} coding. It allows us to consider more local patterns by further lowering the user inputs, like minimum support and minimum confidence. The proposed coding enables more local patterns to participate in the knowledge synthesizing/processing activities, and thus the quality of knowledge synthesized from local patterns in different databases is enhanced significantly for a given pattern synthesizing algorithm and computing resource. }
}
@article{Blommaert2014667,
  title = {Data mining for longitudinal data under multicollinearity and time dependence using penalized generalized estimating equations },
  journal = {Computational Statistics & Data Analysis },
  volume = {71},
  number = {0},
  pages = {667 - 680},
  year = {2014},
  note = {},
  issn = {0167-9473},
  doi = {http://dx.doi.org/10.1016/j.csda.2013.02.023},
  url = {http://www.sciencedirect.com/science/article/pii/S0167947313000790},
  author = {A. Blommaert and N. Hens and Ph. Beutels},
  keywords = {Covariate selection},
  keywords = {Generalized estimating equations},
  keywords = {Longitudinal data},
  keywords = {Multicollinearity},
  keywords = {Penalization},
  keywords = {Time-dependent covariates },
  abstract = {Penalized generalized estimating equations with Elastic Net or L2-Smoothly Clipped Absolute Deviation penalization are proposed to simultaneously select the most important variables and estimate their effects for longitudinal Gaussian data when multicollinearity is present. The method is able to consistently select and estimate the main effects even when strong correlations are present. In addition, the potential pitfall of time-dependent covariates is clarified. Both asymptotic theory and simulation results reveal the effectiveness of penalization as a data mining tool for longitudinal data, especially when a large number of variables is present. The method is illustrated by mining for the main determinants of life expectancy in Europe. }
}
@article{Chen20081009,
  title = {Mining e-Learning domain concept map from academic articles },
  journal = {Computers & Education },
  volume = {50},
  number = {3},
  pages = {1009 - 1021},
  year = {2008},
  note = {},
  issn = {0360-1315},
  doi = {http://dx.doi.org/10.1016/j.compedu.2006.10.001},
  url = {http://www.sciencedirect.com/science/article/pii/S0360131506001497},
  author = {Nian-Shing Chen and Kinshuk and Chun-Wang Wei and Hong-Jhe Chen},
  keywords = {Concept map},
  keywords = {Knowledge building},
  keywords = {e-Learning domain},
  keywords = {Adaptive learning},
  keywords = {Text mining },
  abstract = {Recent research has demonstrated the importance of concept maps and their versatile applications, especially in e-Learning. For example, while designing adaptive learning materials, designers need to refer to the concept map of a subject domain. Moreover, concept maps can show the whole picture and core knowledge of a subject domain. Research from the literature also suggests that graphical representation of domain knowledge can reduce the problems of information overload and learning disorientation for learners. However, construction of concept maps typically relied upon domain experts in the past; it is a time-consuming and high-cost task. Concept map creation for emerging new domains such as e-Learning is even more challenging due to its ongoing development nature. The aim of this paper is to construct e-Learning domain concept maps from academic articles. We adopt some relevant journal articles and conference papers in the e-Learning domain as data sources, and apply text-mining techniques to automatically construct concept maps for the e-Learning domain. The constructed concept maps can provide a useful reference for researchers who are new to the e-Learning field to study related issues, for teachers to design adaptive learning materials, and for learners to understand the whole picture of e-Learning domain knowledge. }
}
@article{Suominen2014127,
  title = {Text mining and information analysis of health documents },
  journal = {Artificial Intelligence in Medicine },
  volume = {61},
  number = {3},
  pages = {127 - 130},
  year = {2014},
  note = {Text Mining and Information Analysis of Health Documents },
  issn = {0933-3657},
  doi = {http://dx.doi.org/10.1016/j.artmed.2014.06.001},
  url = {http://www.sciencedirect.com/science/article/pii/S0933365714000657},
  author = {Hanna Suominen}
}
@article{Chien2008280,
  title = {Data mining to improve personnel selection and enhance human capital: A case study in high-technology industry },
  journal = {Expert Systems with Applications },
  volume = {34},
  number = {1},
  pages = {280 - 290},
  year = {2008},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2006.09.003},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417406002776},
  author = {Chen-Fu Chien and Li-Fei Chen},
  keywords = {Personnel selection},
  keywords = {Human capital},
  keywords = {Data mining},
  keywords = {Decision tree},
  keywords = {Semiconductor industry },
  abstract = {The quality of human capital is crucial for high-tech companies to maintain competitive advantages in the knowledge economy era. However, high-technology companies suffering from high turnover rates often find it hard to recruit the right talent. In addition to conventional human resource management approaches, there is an urgent need to develop effective personnel selection mechanisms to find the talent best suited to their own organizations. This study aims to fill the gap by developing a data mining framework based on decision trees and association rules to generate useful rules for personnel selection. The results can provide decision rules relating personnel information to work performance and retention. An empirical study was conducted in a semiconductor company to support its hiring decisions for indirect labor, including engineers and managers with different job functions. The results demonstrated the practical viability of this approach. Moreover, based on discussions among domain experts and the data miner, specific recruitment and human resource management strategies were created from the results. }
}
@article{Pan20071039,
  title = {Mining competent case bases for case-based reasoning },
  journal = {Artificial Intelligence },
  volume = {171},
  number = {16–17},
  pages = {1039 - 1068},
  year = {2007},
  note = {},
  issn = {0004-3702},
  doi = {http://dx.doi.org/10.1016/j.artint.2007.04.018},
  url = {http://www.sciencedirect.com/science/article/pii/S0004370207000938},
  author = {Rong Pan and Qiang Yang and Sinno Jialin Pan},
  keywords = {Case-based reasoning},
  keywords = {Case-base mining},
  keywords = {Competence},
  keywords = {\{KGCM\} },
  abstract = {Case-based reasoning relies heavily on the availability of a highly competent case base to make high-quality decisions. However, good case bases are difficult to come by. In this paper, we present a novel algorithm for automatically mining a high-quality case base from a raw case set that can preserve and sometimes even improve the competence of case-based reasoning. In this paper, we analyze two major problems in previous case-mining algorithms. The first problem is caused by noisy cases such that the nearest neighbor cases of a problem may not provide correct solutions. The second problem is caused by uneven case distribution, such that similar problems may have dissimilar solutions. To solve these problems, we develop a theoretical framework for the error bound in case-based reasoning, and propose a novel case-base mining algorithm guided by the theoretical results that returns a high-quality case base from raw data efficiently. We support our theory and algorithm with extensive empirical evaluation using different benchmark data sets. }
}
@article{deAmo2007401,
  title = {First-order temporal pattern mining with regular expression constraints },
  journal = {Data & Knowledge Engineering },
  volume = {62},
  number = {3},
  pages = {401 - 420},
  year = {2007},
  note = {Including special issue: 20th Brazilian Symposium on Databases (SBBD 2005) },
  issn = {0169-023X},
  doi = {http://dx.doi.org/10.1016/j.datak.2006.08.009},
  url = {http://www.sciencedirect.com/science/article/pii/S0169023X06001662},
  author = {Sandra de Amo and Daniel A. Furtado},
  keywords = {Temporal data mining},
  keywords = {Frequent sequential patterns},
  keywords = {Regular expression constraints},
  keywords = {Constraint-based mining },
  abstract = {Previous studies on mining sequential patterns have focused on temporal patterns specified by some form of propositional temporal logic. However, there are some interesting sequential patterns, such as the multi-sequential patterns, whose specification needs a more expressive formalism, the first-order temporal logic. Multi-sequential patterns appear in different application contexts, for instance in spatial census data mining, which is the target application of the study developed in this paper. We extend a well-known user-controlled tool, based on regular expressions constraints, to the multi-sequential pattern context. This specification tool enables the incorporation of user focus into the mining process. We present MSP-Miner, an Apriori-based algorithm to discover all frequent multi-sequential patterns satisfying a user-specified regular expression constraint. }
}
@article{Rahbarinia2014194,
  title = {PeerRush: Mining for unwanted \{P2P\} traffic },
  journal = {Journal of Information Security and Applications },
  volume = {19},
  number = {3},
  pages = {194 - 208},
  year = {2014},
  note = {},
  issn = {2214-2126},
  doi = {http://dx.doi.org/10.1016/j.jisa.2014.03.002},
  url = {http://www.sciencedirect.com/science/article/pii/S2214212614000143},
  author = {Babak Rahbarinia and Roberto Perdisci and Andrea Lanzi and Kang Li},
  keywords = {\{P2P\}},
  keywords = {Traffic classification},
  keywords = {Botnets },
  abstract = {In this paper we present PeerRush, a novel system for the identification of unwanted \{P2P\} traffic. Unlike most previous work, PeerRush goes beyond \{P2P\} traffic detection, and can accurately categorize the detected \{P2P\} traffic and attribute it to specific \{P2P\} applications, including malicious applications such as \{P2P\} botnets. PeerRush achieves these results without the need of deep packet inspection, and can accurately identify applications that use encrypted \{P2P\} traffic. We implemented a prototype version of PeerRush and performed an extensive evaluation of the system over a variety of \{P2P\} traffic datasets. Our results show that we can detect all the considered types of \{P2P\} traffic with up to 99.5% true positives and 0.1% false positives. Furthermore, PeerRush can attribute the \{P2P\} traffic to a specific \{P2P\} application with a misclassification rate of 0.68% or less. }
}
@article{Chen200765,
  title = {Mining frequent tree-like patterns in large datasets },
  journal = {Data & Knowledge Engineering },
  volume = {62},
  number = {1},
  pages = {65 - 83},
  year = {2007},
  note = {},
  issn = {0169-023X},
  doi = {http://dx.doi.org/10.1016/j.datak.2006.07.003},
  url = {http://www.sciencedirect.com/science/article/pii/S0169023X06001364},
  author = {Tzung-Shi Chen and Shih-Chun Hsu},
  keywords = {Data mining},
  keywords = {Frequent patterns},
  keywords = {Sequential patterns},
  keywords = {Tree-like patterns},
  keywords = {World wide web },
  abstract = {Sequential pattern mining is crucial to data mining domains. This paper proposes a novel data mining approach for exploring hierarchical tree structures, named tree-like patterns, representing the relationships for a pair of items in a sequence. Using tree-like patterns, the relationships for a pair of items can be identified in terms of cause and effect. A novel technique that efficiently counts support values for tree-like patterns using a queue structure is proposed. In addition, this paper addresses an efficient scheme for determining the frequency of a tree-like pattern in a sequence using a dynamic programming approach. Each tree-like pattern embedded in a sequence is considered to have a certain valuable meaning or degree of importance used in different applications. Two proposed formulas are applied to determine the degree of significance for a specific sequence, which denotes the degree of consecutive items in a tree-like pattern for a sequence. The larger the degree of significance a tree-like pattern has, the more the tree-like pattern is compacted in the sequence. The characteristics differentiating the explored patterns from those obtained with other schemes are discussed. A simulation analysis of the proposed data mining approach is utilized to demonstrate its efficacy. Finally, the proposed approach is designed and implemented in a data mining system integrated into a novel e-learning platform. }
}
@article{Cordeiro2014211,
  title = {QuMinS: Fast and scalable querying, mining and summarizing multi-modal databases },
  journal = {Information Sciences },
  volume = {264},
  number = {0},
  pages = {211 - 229},
  year = {2014},
  note = {Serious Games },
  issn = {0020-0255},
  doi = {http://dx.doi.org/10.1016/j.ins.2013.11.013},
  url = {http://www.sciencedirect.com/science/article/pii/S0020025513008001},
  author = {Robson L.F. Cordeiro and Fan Guo and Donna S. Haverkamp and James H. Horne and Ellen K. Hughes and Gunhee Kim and Luciana A.S. Romani and Priscila P. Coltri and Tamires T. Souza and Agma J.M. Traina and Caetano Traina Jr. and Christos Faloutsos},
  keywords = {Low-labor labeling},
  keywords = {Summarization},
  keywords = {Outlier detection},
  keywords = {Query by example},
  keywords = {Clustering},
  keywords = {Satellite imagery },
  abstract = {Given a large image set, in which very few images have labels, how to guess labels for the remaining majority? How to spot images that need brand new labels different from the predefined ones? How to summarize these data to route the user’s attention to what really matters? Here we answer all these questions. Specifically, we propose QuMinS, a fast, scalable solution to two problems: (i) Low-labor labeling (LLL) – given an image set, very few images have labels, find the most appropriate labels for the rest; and (ii) Mining and attention routing – in the same setting, find clusters, the top-$N_O$ outlier images, and the $N_R$ images that best represent the data. Experiments on satellite images spanning up to 2.25 \{GB\} show that, in contrast to state-of-the-art labeling techniques, QuMinS scales linearly on the data size, being up to 40 times faster than top competitors (GCap), still achieving better or equal accuracy, it spots images that potentially require unpredicted labels, and it works even with tiny initial label sets, i.e., nearly five examples. We also report a case study of our method’s practical usage to show that QuMinS is a viable tool for automatic coffee crop detection from remote sensing images. }
}
@article{Hashemi2014384,
  title = {Mining a Persian–English comparable corpus for cross-language information retrieval },
  journal = {Information Processing & Management },
  volume = {50},
  number = {2},
  pages = {384 - 398},
  year = {2014},
  note = {},
  issn = {0306-4573},
  doi = {http://dx.doi.org/10.1016/j.ipm.2013.10.002},
  url = {http://www.sciencedirect.com/science/article/pii/S0306457313001027},
  author = {Homa B. Hashemi and Azadeh Shakery},
  keywords = {Comparable corpora},
  keywords = {Cross-language information retrieval},
  keywords = {Term association network},
  keywords = {Translation validity check },
  abstract = {Knowledge acquisition and bilingual terminology extraction from multilingual corpora are challenging tasks for cross-language information retrieval. In this study, we propose a novel method for mining high quality translation knowledge from our constructed Persian–English comparable corpus, University of Tehran Persian–English Comparable Corpus (UTPECC). We extract translation knowledge based on a Term Association Network (TAN) constructed from term co-occurrences in the same language as well as term associations in different languages. We further propose a post-processing step to check term translation validity by detecting mistranslated terms as outliers. Evaluation results on two different data sets show that translating queries using \{UTPECC\} and the proposed methods significantly outperforms simple dictionary-based methods. Moreover, the experimental results show that our methods are especially effective in translating Out-Of-Vocabulary terms and also in expanding query words based on their associated terms. }
}
@article{Banville200635,
  title = {Mining chemical structural information from the drug literature },
  journal = {Drug Discovery Today },
  volume = {11},
  number = {1–2},
  pages = {35 - 42},
  year = {2006},
  note = {},
  issn = {1359-6446},
  doi = {http://dx.doi.org/10.1016/S1359-6446(05)03682-2},
  url = {http://www.sciencedirect.com/science/article/pii/S1359644605036822},
  author = {Debra L. Banville},
  keywords = {chemical informatics},
  keywords = {chemical text mining},
  keywords = {chemical entity extraction},
  keywords = {chemical name recognition},
  keywords = {chemical structure recognition},
  keywords = {chemical optical character recognition },
  abstract = {It is easier to find too many documents on a life science topic than to find the right information inside these documents. With the application of text data mining to biological documents, it is no surprise that researchers are starting to look at applications that mine out chemical information. The mining of chemical entities – names and structures – brings with it some unique challenges, which commercial and academic efforts are beginning to address. Ultimately, life science text data mining applications need to focus on the marriage of biological and chemical information. }
}
@article{Tseng200741,
  title = {Efficient mining of generalized association rules with non-uniform minimum support },
  journal = {Data & Knowledge Engineering },
  volume = {62},
  number = {1},
  pages = {41 - 64},
  year = {2007},
  note = {},
  issn = {0169-023X},
  doi = {http://dx.doi.org/10.1016/j.datak.2006.07.002},
  url = {http://www.sciencedirect.com/science/article/pii/S0169023X06001352},
  author = {Ming-Cheng Tseng and Wen-Yang Lin},
  keywords = {Data mining},
  keywords = {Generalized association rules},
  keywords = {Multiple minimum supports},
  keywords = {Taxonomy },
  abstract = {Mining generalized association rules between items in the presence of taxonomies has been recognized as an important model in data mining. Earlier work on generalized association rules confined the minimum supports to be uniformly specified for all items or for items within the same taxonomy level. This constraint on minimum support would restrain an expert from discovering some deviations or exceptions that are more interesting but much less supported than general trends. In this paper, we extended the scope of mining generalized association rules in the presence of taxonomies to allow any form of user-specified multiple minimum supports. We discuss the problems of using classic Apriori itemset generation and presented two algorithms, MMS_Cumulate and MMS_Stratify, for discovering the generalized frequent itemsets. Empirical evaluation showed that these two algorithms are very effective and have good linear scale-up characteristics. }
}
@article{Ren2014291,
  title = {Guest Editorial-Special Issue on Green Mining },
  journal = {International Journal of Mining Science and Technology },
  volume = {24},
  number = {3},
  pages = {291 - 292},
  year = {2014},
  note = {Special Issue on Green Mining },
  issn = {2095-2686},
  doi = {http://dx.doi.org/10.1016/j.ijmst.2014.03.022},
  url = {http://www.sciencedirect.com/science/article/pii/S2095268614000652},
  author = {Ting Ren and Jialin Xu}
}
@article{Park2008512,
  title = {Sequence-based clustering for Web usage mining: A new experimental framework and ANN-enhanced K-means algorithm },
  journal = {Data & Knowledge Engineering },
  volume = {65},
  number = {3},
  pages = {512 - 543},
  year = {2008},
  note = {},
  issn = {0169-023X},
  doi = {http://dx.doi.org/10.1016/j.datak.2008.01.002},
  url = {http://www.sciencedirect.com/science/article/pii/S0169023X08000104},
  author = {Sungjune Park and Nallan C. Suresh and Bong-Keun Jeong},
  keywords = {Web usage mining},
  keywords = {Clustering methods},
  keywords = {Simulation},
  keywords = {Artificial intelligence},
  keywords = {Markov chain },
  abstract = {We develop a general sequence-based clustering method by proposing new sequence representation schemes in association with Markov models. The resulting sequence representations allow for calculation of vector-based distances (dissimilarities) between Web user sessions and thus can be used as inputs of various clustering algorithms. We develop an evaluation framework in which the performances of the algorithms are compared in terms of whether the clusters (groups of Web users who follow the same Markov process) are correctly identified using a replicated clustering approach. A series of experiments is conducted to investigate whether clustering performance is affected by different sequence representations and different distance measures as well as by other factors such as number of actual Web user clusters, number of Web pages, similarity between clusters, minimum session length, number of user sessions, and number of clusters to form. A new, fuzzy ART-enhanced K-means algorithm is also developed and its superior performance is demonstrated. }
}
@article{McDonald200780,
  title = {Sediment radioisotope dating across a stratigraphic discontinuity in a mining-impacted lake },
  journal = {Journal of Environmental Radioactivity },
  volume = {92},
  number = {2},
  pages = {80 - 95},
  year = {2007},
  note = {},
  issn = {0265-931X},
  doi = {http://dx.doi.org/10.1016/j.jenvrad.2006.09.009},
  url = {http://www.sciencedirect.com/science/article/pii/S0265931X06001688},
  author = {C.P. McDonald and N.R. Urban},
  keywords = {Sediment},
  keywords = {Radioisotope},
  keywords = {Dating},
  keywords = {\{CRS\}},
  keywords = {210Pb},
  keywords = {137Cs},
  keywords = {Mining },
  abstract = {Application of radioisotope sediment dating models to lakes subjected to large anthropogenic sediment inputs can be problematic. As a result of copper mining activities, Torch Lake received large volumes of sediment, the characteristics of which were dramatically different from those of the native sediment. Commonly used dating models (CIC-CSR, CRS) were applied to Torch Lake, but assumptions of these methods are violated, rendering sediment geochronologies inaccurate. A modification was made to the \{CRS\} model, utilizing a distinct horizon separating mining from post-mining sediment to differentiate between two focusing regimes. 210Pb inventories in post-mining sediment were adjusted to correspond to those in mining-era sediment, and a sediment geochronology was established and verified using independent markers in 137Cs accumulation profiles and core X-rays. }
}
@article{Zhao2006627,
  title = {\{XML\} structural delta mining: Issues and challenges },
  journal = {Data & Knowledge Engineering },
  volume = {59},
  number = {3},
  pages = {627 - 651},
  year = {2006},
  note = {Including: \{ER\} 2003 Selection of papers presented at the 22nd International Conference on Conceptual Modeling 22nd International Conference on Conceptual Modeling },
  issn = {0169-023X},
  doi = {http://dx.doi.org/10.1016/j.datak.2005.10.002},
  url = {http://www.sciencedirect.com/science/article/pii/S0169023X05001655},
  author = {Qiankun Zhao and Ling Chen and Sourav S. Bhowmick and Sanjay Madria},
  keywords = {Versions of \{XML\} documents},
  keywords = {Structural delta},
  keywords = {Dynamic metrics},
  keywords = {\{XML\} structural delta mining},
  keywords = {Research issues},
  keywords = {Applications },
  abstract = {Recently, there have been increasing research efforts in \{XML\} data mining. These research efforts have largely assumed that \{XML\} documents are static. However, in reality, the documents are rarely static. In this paper, we propose a novel research problem called \{XML\} structural delta mining. The objective of \{XML\} structural delta mining is to discover knowledge by analyzing the structural evolution pattern (also called structural delta) of the history of \{XML\} documents. Unlike existing approaches, \{XML\} structural delta mining focuses on the dynamic and temporal features of \{XML\} data. Furthermore, the data source for this novel mining technique is a sequence of historical versions of an \{XML\} document rather than a set of snapshot \{XML\} documents. Such a mining technique can be useful in many applications such as change detection for very large \{XML\} documents, efficient \{XML\} indexing, \{XML\} search engines, etc. Our aim in this paper is not to provide a specific solution to a particular mining problem. Rather, we present the vision of the mining framework and present the issues and challenges for three types of \{XML\} structural delta mining: identifying various interesting structures, discovering association rules from structural deltas, and structural change pattern-based classification. }
}
@article{Zhou200787,
  title = {Integrative mining of traditional Chinese medicine literature and \{MEDLINE\} for functional gene networks },
  journal = {Artificial Intelligence in Medicine },
  volume = {41},
  number = {2},
  pages = {87 - 104},
  year = {2007},
  note = {Integrative data mining in systems biology },
  issn = {0933-3657},
  doi = {http://dx.doi.org/10.1016/j.artmed.2007.07.007},
  url = {http://www.sciencedirect.com/science/article/pii/S0933365707000942},
  author = {Xuezhong Zhou and Baoyan Liu and Zhaohui Wu and Yi Feng},
  keywords = {Integrative data mining},
  keywords = {Functional gene network},
  keywords = {Traditional Chinese medicine literature},
  keywords = {\{MEDLINE\}},
  keywords = {Text mining },
  abstract = {Objective: The amount of biomedical data in different disciplines is growing at an exponential rate. Integrating these significant knowledge sources to generate novel hypotheses for systems biology research is difficult. Traditional Chinese medicine (TCM) is a completely different discipline, and is a complementary knowledge system to modern biomedical science. This paper uses a significant \{TCM\} bibliographic literature database in China, together with MEDLINE, to help discover novel gene functional knowledge. Materials and methods: We present an integrative mining approach to uncover the functional gene relationships from \{MEDLINE\} and \{TCM\} bibliographic literature. This paper introduces \{TCM\} literature (about 50,000 records) as one knowledge source for constructing literature-based gene networks. We use the \{TCM\} diagnosis, \{TCM\} syndrome, to automatically congregate the related genes. The syndrome–gene relationships are discovered based on the syndrome–disease relationships extracted from \{TCM\} literature and the disease–gene relationships in MEDLINE. Based on the bubble-bootstrapping and relation weight computing methods, we have developed a prototype system called MeDisco/3S, which has name entity and relation extraction, and online analytical processing (OLAP) capabilities, to perform the integrative mining process. Results: We have obtained about 200,000 syndrome–gene relations, which could help generate syndrome-based gene networks, and help analyze the functional knowledge of genes from the syndrome perspective. We take the gene network of Kidney–Yang Deficiency syndrome (KYD syndrome) and the functional analysis of some genes, such as \{CRH\} (corticotropin releasing hormone), \{PTH\} (parathyroid hormone), \{PRL\} (prolactin), \{BRCA1\} (breast cancer 1, early onset) and \{BRCA2\} (breast cancer 2, early onset), to demonstrate the preliminary results. The underlying hypothesis is that the related genes of the same syndrome will have some biological functional relationships, and will constitute a functional network. Conclusion: This paper presents an approach to integrate \{TCM\} literature and modern biomedical data to discover novel gene networks and functional knowledge of genes. The preliminary results show that novel gene functional knowledge and gene networks, which are worthy of further investigation, could be generated by integrating the two complementary biomedical data sources. Integrative mining of \{TCM\} and modern life science literature promises to be a fruitful research field. }
}
@article{Mohamadi20081824,
  title = {Data mining with a simulated annealing based fuzzy classification system },
  journal = {Pattern Recognition },
  volume = {41},
  number = {5},
  pages = {1824 - 1833},
  year = {2008},
  note = {},
  issn = {0031-3203},
  doi = {http://dx.doi.org/10.1016/j.patcog.2007.11.002},
  url = {http://www.sciencedirect.com/science/article/pii/S0031320307004931},
  author = {Hamid Mohamadi and Jafar Habibi and Mohammad Saniee Abadeh and Hamid Saadi},
  keywords = {Simulated annealing},
  keywords = {Data mining},
  keywords = {Pattern classification},
  keywords = {Fuzzy systems},
  keywords = {Fuzzy rule extraction },
  abstract = {In this paper, the use of the simulated annealing (SA) metaheuristic for constructing a fuzzy classification system is presented. Several previous investigations have demonstrated the capability of fuzzy systems to solve different kinds of problems. The simulated annealing based fuzzy classification system (SAFCS) hybridizes the learning capability of the \{SA\} metaheuristic with the approximate reasoning method of fuzzy systems. The objective of this paper is to illustrate the ability of \{SA\} to develop an accurate fuzzy classifier. The use of \{SA\} in classification is an attempt to effectively explore and exploit the large search space usually associated with classification problems, and to find the optimum set of fuzzy if–then rules. The \{SAFCS\} is capable of extracting accurate fuzzy classification rules from input data sets and applying them to classify new data instances into different predefined groups or classes. Experiments are performed with eight \{UCI\} data sets. The results indicate that the proposed \{SAFCS\} achieves competitive results in comparison with several well-known classification algorithms. }
}
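
The SAFCS idea above can be illustrated in miniature: simulated annealing searches over subsets of candidate fuzzy if–then rules, occasionally accepting worse rule sets with a temperature-controlled probability. The sketch below is a hypothetical reconstruction on toy 2-D data, not the authors' system; the triangular partitions, cooling schedule and scoring are illustrative assumptions.

import math, random

random.seed(0)

# Toy 2-D data: class 0 clusters near (0.2, 0.2), class 1 near (0.8, 0.8).
data = ([((random.gauss(0.2, 0.1), random.gauss(0.2, 0.1)), 0) for _ in range(50)]
        + [((random.gauss(0.8, 0.1), random.gauss(0.8, 0.1)), 1) for _ in range(50)])

def tri(x, c, w=0.5):
    # Triangular membership centred at c with half-width w (assumed shape).
    return max(0.0, 1.0 - abs(x - c) / w)

centers = [0.0, 0.5, 1.0]  # linguistic terms 'low', 'medium', 'high' per feature
# Candidate rule (c1, c2, cls): IF x1 is ~c1 AND x2 is ~c2 THEN class cls.
rules = [(c1, c2, cls) for c1 in centers for c2 in centers for cls in (0, 1)]

def classify(x, active):
    best, label = 0.0, 0
    for i in active:
        c1, c2, cls = rules[i]
        strength = min(tri(x[0], c1), tri(x[1], c2))  # AND via minimum
        if strength > best:
            best, label = strength, cls
    return label

def accuracy(active):
    return sum(classify(x, active) == y for x, y in data) / len(data) if active else 0.0

# Simulated annealing over rule subsets: toggle one rule per move, cool geometrically.
state = set(random.sample(range(len(rules)), 4))
score, temp = accuracy(state), 1.0
for _ in range(2000):
    cand = set(state)
    cand.symmetric_difference_update({random.randrange(len(rules))})
    s = accuracy(cand)
    if s >= score or random.random() < math.exp((s - score) / temp):
        state, score = cand, s
    temp = max(1e-3, temp * 0.995)

print(f"kept {len(state)} rules, training accuracy {score:.2f}")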
@article{Kirkos2007995,
  title = {Data Mining techniques for the detection of fraudulent financial statements },
  journal = {Expert Systems with Applications },
  volume = {32},
  number = {4},
  pages = {995 - 1003},
  year = {2007},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2006.02.016},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417406000765},
  author = {Efstathios Kirkos and Charalambos Spathis and Yannis Manolopoulos},
  keywords = {Fraudulent financial statements},
  keywords = {Management fraud},
  keywords = {Data Mining},
  keywords = {Auditing},
  keywords = {Greece },
  abstract = {This paper explores the effectiveness of Data Mining (DM) classification techniques in detecting firms that issue fraudulent financial statements (FFS) and deals with the identification of factors associated with FFS. In accomplishing the task of management fraud detection, auditors can be assisted in their work by Data Mining techniques. This study investigates the usefulness of Decision Trees, Neural Networks and Bayesian Belief Networks in the identification of fraudulent financial statements. The input vector is composed of ratios derived from financial statements. The three models are compared in terms of their performance. }
}
@article{Huang2007441,
  title = {Mining knowledge from object-oriented instances },
  journal = {Expert Systems with Applications },
  volume = {33},
  number = {2},
  pages = {441 - 450},
  year = {2007},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2006.05.029},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417406001606},
  author = {Cheng-Ming Huang and Tzung-Pei Hong and Shi-Jinn Horng},
  keywords = {Association rule},
  keywords = {Data mining},
  keywords = {Object transaction},
  keywords = {Object-oriented mining },
  abstract = {Data mining is the process of extracting desirable knowledge or interesting patterns from existing databases for specific purposes. Recently, the object concept has become very popular and is used in a variety of applications, especially for complex data description. This paper thus proposes a new data-mining algorithm for extracting interesting knowledge from transactions stored as object data. Each item itself is thought of as a class, and each item purchased in a transaction is thought of as an instance. Instances with the same class (item name) may have different attribute values since they may appear in different transactions. The proposed algorithm is divided into two main phases, one for intra-object association rules, and the other for inter-object association rules. Two apriori-like procedures are adopted to find the two kinds of rules. The first phase finds the association relations within the same kind of objects. Each large itemset found in this phase can be thought of as a composite item used in phase 2. The second phase then finds the relationships among different kinds of objects. Both the intra-object and inter-object association rules can thus be easily derived by the proposed algorithm at the same time. Experiments are also reported to show the effectiveness of the proposed algorithm. }
}
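
Both phases of Huang et al.'s algorithm are described as apriori-like; for readers who want the baseline, here is a bare-bones Apriori frequent-itemset pass. It is a generic textbook sketch (candidate generation by pairwise union, toy transactions, no object attributes), not the paper's two-phase algorithm.

from itertools import combinations

transactions = [{"a", "b", "c"}, {"a", "b"}, {"a", "c"}, {"b", "c"}, {"a", "b", "c"}]
minsup = 3  # absolute minimum support, an assumed setting

def frequent_itemsets(transactions, minsup):
    # Level 1: frequent single items.
    items = {frozenset([i]) for t in transactions for i in t}
    level = {s for s in items if sum(s <= t for t in transactions) >= minsup}
    result, k = set(level), 2
    while level:
        # Candidate k-itemsets as unions of frequent (k-1)-itemsets, then count.
        cands = {a | b for a in level for b in level if len(a | b) == k}
        level = {c for c in cands if sum(c <= t for t in transactions) >= minsup}
        result |= level
        k += 1
    return result

for s in sorted(frequent_itemsets(transactions, minsup), key=len):
    print(sorted(s))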
@article{Demšar2007551,
  title = {Investigating visual exploration of geospatial data: An exploratory usability experiment for visual data mining },
  journal = {Computers, Environment and Urban Systems },
  volume = {31},
  number = {5},
  pages = {551 - 571},
  year = {2007},
  note = {Geospatial Analysis and Modeling },
  issn = {0198-9715},
  doi = {http://dx.doi.org/10.1016/j.compenvurbsys.2007.08.006},
  url = {http://www.sciencedirect.com/science/article/pii/S0198971507000579},
  author = {Urška Demšar},
  keywords = {Exploratory geovisualisation},
  keywords = {Visual data mining},
  keywords = {Exploratory usability },
  abstract = {This study presents a small exploratory usability experiment whose goal is to observe how people visually explore geospatial data. The well-known iris dataset from pattern recognition was put into a geographical context for this experiment, in order to provide the participants with a dataset with easily observable spatial and other relationships. The participants were given a free hand to explore this dataset with a visual data mining system in any way they liked. The protocols collected during the experiment with the thinking-aloud method were analysed with the aim of understanding what types of hypotheses the participants formed, which visualisations they used to derive, confirm or reject their hypotheses, and what exploration strategies they adopted. }
}
@article{Liu2007304,
  title = {Combined mining of Web server logs and web contents for classifying user navigation patterns and predicting users’ future requests },
  journal = {Data \& Knowledge Engineering },
  volume = {61},
  number = {2},
  pages = {304 - 330},
  year = {2007},
  note = {},
  issn = {0169-023X},
  doi = {http://dx.doi.org/10.1016/j.datak.2006.06.001},
  url = {http://www.sciencedirect.com/science/article/pii/S0169023X06000954},
  author = {Haibin Liu and Vlado Kešelj},
  keywords = {Web usage mining},
  keywords = {Web content mining},
  keywords = {User navigation profiles},
  keywords = {Classification},
  keywords = {Prediction },
  abstract = {We present a study of the automatic classification of web user navigation patterns and propose a novel approach to classifying user navigation patterns and predicting users’ future requests. The approach is based on the combined mining of Web server logs and the contents of the retrieved web pages. The textual content of web pages is captured through extraction of character N-grams, which are combined with Web server log files to derive user navigation profiles. The approach is implemented as an experimental system, and its performance is evaluated on two tasks: classification and prediction. The system achieves a classification accuracy of nearly 70% and a prediction accuracy of about 65%, which is about 20% higher than the accuracy obtained by mining Web server logs alone. This approach may be used to facilitate better web personalization and website organization. }
}
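
The character-N-gram capture of page content that Liu and Kešelj describe reduces, at its core, to counting overlapping substrings; a minimal sketch follows. The n = 3 choice and the toy string are assumptions, and the real system combines such profiles with server-log sessions.

from collections import Counter

def char_ngrams(text, n=3):
    # Overlapping character n-grams, lowercased; n = 3 is an assumed setting.
    t = text.lower()
    return Counter(t[i:i + n] for i in range(len(t) - n + 1))

profile = char_ngrams("Web usage mining combines logs and content")
print(profile.most_common(5))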
@article{Li2007485,
  title = {A rough sets based characteristic relation approach for dynamic attribute generalization in data mining },
  journal = {Knowledge-Based Systems },
  volume = {20},
  number = {5},
  pages = {485 - 494},
  year = {2007},
  note = {Intelligent Knowledge Engineering Systems 2006 International Conference on Intelligent Systems and Engineering },
  issn = {0950-7051},
  doi = {http://dx.doi.org/10.1016/j.knosys.2007.01.002},
  url = {http://www.sciencedirect.com/science/article/pii/S0950705107000111},
  author = {Tianrui Li and Da Ruan and Wets Geert and Jing Song and Yang Xu},
  keywords = {Rough sets},
  keywords = {Knowledge discovery},
  keywords = {Data mining},
  keywords = {Incomplete information systems },
  abstract = {Any attribute set in an information system may evolve in time as new information arrives. Approximations of a concept by rough set theory need updating for data mining or other related tasks. For incrementally updating approximations of a concept, methods using the tolerance relation and the similarity relation have been studied previously in the literature. The characteristic relation-based rough sets approach provides more informative results than the tolerance- and similarity-relation based approaches. In this paper, attribute generalization and its relation to feature selection and feature extraction are first discussed. Then, a new approach for incrementally updating approximations of a concept is presented under characteristic relation-based rough sets. Finally, the approach of direct computation of rough set approximations and the proposed approach of dynamic maintenance of rough set approximations are employed for performance comparison. An extensive experimental evaluation on a large soybean database from \{MLC\} shows that the proposed approach effectively handles dynamic attribute generalization in data mining. }
}
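
As background for the approximation updating that Li et al. study, the classical complete-information rough-set approximations can be computed in a few lines; the paper's characteristic relation generalizes the indiscernibility classes used below. The decision table here is a made-up toy.

from collections import defaultdict

# Toy decision table: (condition-attribute tuple, decision).
table = [
    (("high", "yes"), "flu"),
    (("high", "no"), "flu"),
    (("high", "no"), "healthy"),  # conflicts with the previous object
    (("low", "no"), "healthy"),
]

# Indiscernibility classes over the condition attributes.
blocks = defaultdict(set)
for i, (attrs, _) in enumerate(table):
    blocks[attrs].add(i)

concept = {i for i, (_, d) in enumerate(table) if d == "flu"}
lower = {i for b in blocks.values() if b <= concept for i in b}  # certainly flu
upper = {i for b in blocks.values() if b & concept for i in b}   # possibly flu
print("lower:", sorted(lower), "upper:", sorted(upper))  # [0] vs [0, 1, 2]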
@article{Pérez200742,
  title = {Design and implementation of a data mining grid-aware architecture },
  journal = {Future Generation Computer Systems },
  volume = {23},
  number = {1},
  pages = {42 - 47},
  year = {2007},
  note = {},
  issn = {0167-739X},
  doi = {http://dx.doi.org/10.1016/j.future.2006.04.008},
  url = {http://www.sciencedirect.com/science/article/pii/S0167739X06000732},
  author = {María S. Pérez and Alberto Sánchez and Víctor Robles and Pilar Herrero and José M. Peña},
  keywords = {Data mining},
  keywords = {Data mining grid architecture (DMGA)},
  keywords = {WekaG },
  abstract = {Current business processes often use data from several sources. These data are typically heterogeneous and incomplete, and usually involve a huge number of records. This implies that the data must be transformed into a set of patterns, rules or some other kind of formalism that helps to understand the underlying information. The participation of several organizations in this process makes the assimilation of data more difficult. Data mining is a widely used approach for the transformation of data into useful patterns, aiding comprehensive knowledge of the concrete domain information. Nevertheless, traditional data mining techniques are difficult to apply in current scenarios, due to the complexity mentioned previously. The Data Mining Grid tries to fix these problems, allowing the data mining process to be deployed in a grid environment, in which data and service resources are geographically distributed, belong to several virtual organizations, and security can be handled flexibly. We propose both a novel architecture for the Data Mining Grid, named DMGA, and an implementation of this architecture, named WekaG. }
}
@article{Wang2014191,
  title = {Grey System Theory based prediction for topic trend on Internet },
  journal = {Engineering Applications of Artificial Intelligence },
  volume = {29},
  number = {0},
  pages = {191 - 200},
  year = {2014},
  note = {},
  issn = {0952-1976},
  doi = {http://dx.doi.org/10.1016/j.engappai.2013.12.005},
  url = {http://www.sciencedirect.com/science/article/pii/S095219761300239X},
  author = {Xingqi Wang and Lei Qi and Chan Chen and Jingfan Tang and Ming Jiang},
  keywords = {Topic trend prediction},
  keywords = {Grey System Theory},
  keywords = {Grey Verhulst Model },
  abstract = {Techniques for extracting topics from the dynamic Internet are relatively mature. However, topic trends still cannot be predicted accurately. Unfortunately, for the prediction of topic trends, the availability of data is always very limited owing to the short life cycle of topics, especially in such a highly efficient and fast-paced era. Based on the Grey Verhulst Model, this paper presents an algorithm to predict topic trends. The principle of the Grey Model for prediction applications is analyzed and the Grey Verhulst Model is established. Meanwhile, real-world data from Youku (the largest video site in China, similar to YouTube) are used to test the presented algorithm. The average relative error of the Grey Verhulst Model is less than 3%. The results show that the Grey Verhulst Model has a high prediction precision. The main contributions of this paper are as follows. First, we introduce Grey System Theory (GST), which originated from system theory, to the prediction of topic trends and, to some extent, solve the problem with high accuracy; second, to the best of our knowledge, this is the first attempt to employ \{GST\} in the field of topic trend prediction. }
}
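
For readers unfamiliar with the Grey Verhulst Model named above, the standard textbook construction (1-AGO sequence, background values, least-squares estimation of the two grey parameters, then the logistic-style time-response function) fits in a short NumPy sketch. The topic-heat series is invented, and this is the common formulation, not necessarily the paper's exact variant.

import numpy as np

x0 = np.array([1.2, 3.5, 7.0, 9.8, 8.1, 4.9, 2.2, 0.9])  # hypothetical topic-heat series
x1 = np.cumsum(x0)             # 1-AGO: accumulated generating operation
z1 = 0.5 * (x1[1:] + x1[:-1])  # background (mean-generated) values

# Grey Verhulst equation x0(k) + a*z1(k) = b*z1(k)^2; parameters by least squares.
B = np.column_stack([-z1, z1 ** 2])
a, b = np.linalg.lstsq(B, x0[1:], rcond=None)[0]

def x1_hat(k):
    # Time-response function of the whitening equation dx1/dt + a*x1 = b*x1^2.
    return a * x0[0] / (b * x0[0] + (a - b * x0[0]) * np.exp(a * k))

fit = np.array([x1_hat(k) for k in range(len(x0) + 1)])
x0_hat = np.diff(fit, prepend=0.0)  # 1-IAGO restores the series (+ one forecast step)
err = np.abs(x0_hat[:len(x0)] - x0) / x0
print(f"mean relative error {err.mean():.1%}, next-step forecast {x0_hat[-1]:.2f}")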
@article{Verstak201366,
  title = {Using hierarchical data mining to characterize performance of wireless system configurations },
  journal = {Advances in Engineering Software },
  volume = {65},
  number = {0},
  pages = {66 - 77},
  year = {2013},
  note = {},
  issn = {0965-9978},
  doi = {http://dx.doi.org/10.1016/j.advengsoft.2013.05.012},
  url = {http://www.sciencedirect.com/science/article/pii/S0965997813000902},
  author = {Alex Verstak and Naren Ramakrishnan and Layne T. Watson and Jian He and Clifford A. Shaffer and Ananth Y. Grama},
  keywords = {Wideband code division multiple access},
  keywords = {Bit error probability},
  keywords = {Simulation database analysis},
  keywords = {Performance assessment },
  abstract = {This paper presents a statistical framework for assessing wireless systems performance using hierarchical data mining techniques. We consider \{WCDMA\} (wideband code division multiple access) systems with two-branch \{STTD\} (space time transmit diversity) and 1/2 rate convolutional coding (forward error correction codes). Monte Carlo simulation estimates the bit error probability (BEP) of the system across a wide range of signal-to-noise ratios (SNRs). A performance database of simulation runs is collected over a targeted space of system configurations. This database is then mined to obtain regions of the configuration space that exhibit acceptable average performance. The shape of the mined regions illustrates the joint influence of configuration parameters on system performance. The role of data mining in this application is to provide explainable and statistically valid design conclusions. The research issue is to define statistically meaningful aggregation of data in a manner that permits efficient and effective data mining algorithms. We achieve a good compromise between these goals and help establish the applicability of data mining for characterizing wireless systems performance. }
}
@article{Wu2006574,
  title = {Mining web navigations for intelligence },
  journal = {Decision Support Systems },
  volume = {41},
  number = {3},
  pages = {574 - 591},
  year = {2006},
  note = {Intelligence and security informatics },
  issn = {0167-9236},
  doi = {http://dx.doi.org/10.1016/j.dss.2004.06.011},
  url = {http://www.sciencedirect.com/science/article/pii/S0167923604001320},
  author = {Harris Wu and Michael Gordon and Kurtis DeMaagd and Weiguo Fan},
  keywords = {Principal clusters analysis},
  keywords = {Intelligence},
  keywords = {Mining},
  keywords = {Trend analysis},
  keywords = {Navigation analysis},
  keywords = {Information overload},
  keywords = {Web community },
  abstract = {The Internet is one of the fastest growing areas of intelligence gathering. We present a statistical approach, called principal clusters analysis, for analyzing millions of user navigations on the Web. This technique identifies prominent navigation clusters on different topics. Furthermore, it can determine information items that are useful starting points to explore a topic, as well as key documents to explore the topic in greater detail. Trends can be detected by observing navigation prominence over time. We apply this technique on a large popular website. The results show promise in web intelligence mining. }
}
@article{Takita2014137,
  title = {Pseudo-right dislocation, the bare-topic construction, and hanging topic constructions },
  journal = {Lingua },
  volume = {140},
  number = {0},
  pages = {137 - 157},
  year = {2014},
  note = {},
  issn = {0024-3841},
  doi = {http://dx.doi.org/10.1016/j.lingua.2013.12.010},
  url = {http://www.sciencedirect.com/science/article/pii/S0024384113002647},
  author = {Kensuke Takita},
  keywords = {Right dislocation},
  keywords = {Hanging topic construction},
  keywords = {Case-markers/postpositions},
  keywords = {Japanese},
  keywords = {Romance },
  abstract = {This paper first argues that Japanese right dislocation, where a constituent appears in the post-verbal position, is derived in two ways, depending on whether the dislocated constituent is accompanied with Case-markers/postpositions or not. In particular, it is argued that while right dislocation involves clausal ellipsis when the dislocated element is accompanied with Case-markers/postpositions, it is derived from what is called the bare-topic construction when the dislocated element is not accompanied with Case-markers/postpositions. Then, it is illustrated that the bare-topic construction, where the topic element is base-generated in the sentence-initial position without any Case-marker, postposition, or the topic-marker -wa, has close similarities with Hanging Topic constructions found in Romance and other languages. Claiming that the bare-topic construction should be equated with Hanging Topic constructions, this paper argues that an investigation of the properties of Japanese right dislocation makes it possible to contribute to a deeper understanding of the nature of the bare-topic construction, which in turn opens a novel way of comparing Japanese with other languages in terms of the syntax of topics. }
}
@article{Cedano2014930,
  title = {Solar Energy Research in Ibero-America, a Citation Mining Approach },
  journal = {Energy Procedia },
  volume = {57},
  number = {0},
  pages = {930 - 939},
  year = {2014},
  note = {2013 \{ISES\} Solar World Congress },
  issn = {1876-6102},
  doi = {http://dx.doi.org/10.1016/j.egypro.2014.10.075},
  url = {http://www.sciencedirect.com/science/article/pii/S1876610214014428},
  author = {Karla G. Cedano and Karla F. Ricalde and J. Antonio del Río and Manuel Martínez},
  keywords = {Solar energy},
  keywords = {prospective analysis},
  keywords = {scientometrics },
  abstract = {In this paper we present an analysis of all the research papers published in journals registered in the Web of Science with the phrase “solar energy” in their title, abstract or keywords and with at least one author with an address in Ibero-America. We present the results of citation mining applied to all such records published between 2002 and 2012. This analysis characterizes the behavior of the scientific production on solar energy in the most prolific countries of this region. }
}
@article{Yan201498,
  title = {Research dynamics: Measuring the continuity and popularity of research topics },
  journal = {Journal of Informetrics },
  volume = {8},
  number = {1},
  pages = {98 - 110},
  year = {2014},
  note = {},
  issn = {1751-1577},
  doi = {http://dx.doi.org/10.1016/j.joi.2013.10.010},
  url = {http://www.sciencedirect.com/science/article/pii/S1751157713000874},
  author = {Erjia Yan},
  keywords = {Topic analysis},
  keywords = {Networks},
  keywords = {Popularity},
  keywords = {Continuity},
  keywords = {Dynamics },
  abstract = {Dynamic development is an intrinsic characteristic of research topics. To study this, this paper proposes two sets of topic attributes to examine topic dynamic characteristics: topic continuity and topic popularity. Topic continuity comprises six attributes: steady, concentrating, diluting, sporadic, transforming, and emerging topics; topic popularity comprises three attributes: rising, declining, and fluctuating topics. These attributes are applied to a data set on library and information science publications during the past 11 years (2001–2011). Results show that topics on “web information retrieval”, “citation and bibliometrics”, “system and technology”, and “health science” have the highest average popularity; topics on “h-index”, “online communities”, “data preservation”, “social media”, and “web analysis” are increasingly becoming popular in library and information science. }
}
@article{Chien2007192,
  title = {Data mining for yield enhancement in semiconductor manufacturing and an empirical study },
  journal = {Expert Systems with Applications },
  volume = {33},
  number = {1},
  pages = {192 - 198},
  year = {2007},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2006.04.014},
  url = {http://www.sciencedirect.com/science/article/pii/S095741740600131X},
  author = {Chen-Fu Chien and Wen-Chih Wang and Jen-Chieh Cheng},
  keywords = {Data mining},
  keywords = {Decision tree},
  keywords = {Clustering},
  keywords = {Defect diagnosis},
  keywords = {Yield enhancement},
  keywords = {Semiconductor manufacturing },
  abstract = {During wafer fabrication, process data, equipment data, and lot histories are automatically or semi-automatically recorded and accumulated in databases for monitoring the process, diagnosing faults, and managing manufacturing. However, in high-tech industries such as semiconductor manufacturing, many interrelated factors affect the yield of fabricated wafers. Engineers who rely on personal domain knowledge cannot find possible root causes of defects rapidly and effectively. This study aims to develop a framework for data mining and knowledge discovery from databases that consists of a Kruskal–Wallis test, K-means clustering, and the variance reduction splitting criterion to investigate the huge amount of semiconductor manufacturing data and infer possible causes of faults and manufacturing process variations. The extracted information and knowledge are helpful to engineers as a basis for troubleshooting and defect diagnosis. We validated this approach with an empirical study in a semiconductor foundry company in Taiwan, and the results demonstrated the practical viability of the approach. }
}
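
The first stage of Chien et al.'s framework, the Kruskal–Wallis test, is available off the shelf; the snippet below shows that screening step on fabricated per-tool yield numbers. The data, the three-tool setup, and the 0.05 reading are assumptions.

from scipy.stats import kruskal

# Hypothetical lot yields (%) grouped by the tool that processed them.
yield_by_tool = {
    "tool_A": [91.2, 90.8, 92.1, 91.5],
    "tool_B": [90.9, 91.4, 91.1, 90.7],
    "tool_C": [84.3, 85.1, 83.8, 84.9],  # the suspicious tool
}
stat, p = kruskal(*yield_by_tool.values())
print(f"H = {stat:.2f}, p = {p:.4f}")  # a small p flags this tool set for drill-down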
@article{Abulaish2007228,
  title = {Biological relation extraction and query answering from \{MEDLINE\} abstracts using ontology-based text mining },
  journal = {Data \& Knowledge Engineering },
  volume = {61},
  number = {2},
  pages = {228 - 262},
  year = {2007},
  note = {},
  issn = {0169-023X},
  doi = {http://dx.doi.org/10.1016/j.datak.2006.06.007},
  url = {http://www.sciencedirect.com/science/article/pii/S0169023X06000929},
  author = {Muhammad Abulaish and Lipika Dey},
  keywords = {Text mining},
  keywords = {Ontology},
  keywords = {Biological relation extraction},
  keywords = {Biological query processing },
  abstract = {The rapid growth of the biological text data repository makes it difficult for human beings to access required information in a convenient and effective manner. The problem arises due to the fact that most of the information is embedded within unstructured or semi-structured text that computers cannot interpret very easily. In this paper we present an ontology-based Biological Information Extraction and Query Answering (BIEQA) System, which initiates text mining with a set of concepts stored in a biological ontology, and thereafter mines possible biological relations among those concepts using \{NLP\} techniques and co-occurrence-based analysis. The system extracts all frequently occurring biological relations among a pair of biological concepts through text mining. A mined relation is associated with a fuzzy membership value, which is proportional to its frequency of occurrence in the corpus and is termed a fuzzy biological relation. The fuzzy biological relations extracted from a text corpus, along with other relevant information components like biological entities occurring within a relation, are stored in a database. The database is integrated with a query-processing module. The query-processing module has an interface, which guides users to formulate biological queries at different levels of specificity. }
}
@article{Plasse2007596,
  title = {Combined use of association rules mining and clustering methods to find relevant links between binary rare attributes in a large data set },
  journal = {Computational Statistics \& Data Analysis },
  volume = {52},
  number = {1},
  pages = {596 - 613},
  year = {2007},
  note = {},
  issn = {0167-9473},
  doi = {http://dx.doi.org/10.1016/j.csda.2007.02.020},
  url = {http://www.sciencedirect.com/science/article/pii/S0167947307000904},
  author = {Marie Plasse and Ndeye Niang and Gilbert Saporta and Alexandre Villeminot and Laurent Leblond},
  keywords = {Association rules mining},
  keywords = {Variable clustering},
  keywords = {Large sparse matrix},
  keywords = {Binary attributes},
  keywords = {Rule relevancy index },
  abstract = {A method to analyse links between binary attributes in a large sparse data set is proposed. Initially, the variables are clustered to obtain homogeneous clusters of attributes. Association rules are then mined in each cluster. A graphical comparison of some rule relevancy indexes is presented. It is used to extract the best rules depending on the application concerned. The proposed methodology is illustrated by an industrial application from the automotive industry with more than 80 000 vehicles, each described by more than 3000 rare attributes. }
}
@article{Zirn201438,
  title = {Multidimensional topic analysis in political texts },
  journal = {Data \& Knowledge Engineering },
  volume = {90},
  number = {0},
  pages = {38 - 53},
  year = {2014},
  note = {Special Issue on Natural Language Processing and Information Systems (NLDB 2012) },
  issn = {0169-023X},
  doi = {http://dx.doi.org/10.1016/j.datak.2013.07.003},
  url = {http://www.sciencedirect.com/science/article/pii/S0169023X1300075X},
  author = {Cäcilia Zirn and Heiner Stuckenschmidt},
  keywords = {Topic models},
  keywords = {Political science},
  keywords = {Text Analysis },
  abstract = {Automatic content analysis is increasingly becoming an accepted research method in social science. In political science, researchers use party manifestos and transcripts of political speeches to analyze the positions of different actors. Existing approaches are limited to a single dimension; in particular, they cannot distinguish between positions with respect to a specific topic. In this paper, we propose a method for analyzing and comparing documents according to a set of predefined topics that is based on an extension of Latent Dirichlet Allocation (LDA) for inducing knowledge about relevant topics. We validate the method by showing that it can guess which member of a coalition was assigned a certain ministry based on a comparison of the parties' election manifestos with the coalition contract. We apply the method to German national elections since 1990 and show that our method consistently outperforms a baseline that simulates manual annotation of individual sentences based on keywords and standard text comparison. In our experiments, we compare two different extensions of \{LDA\} and investigate the influence of the seed set used. Finally, we give a brief illustration of how the output of our method can be interpreted to compare positions towards specific topics across several parties. }
}
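
Zirn and Stuckenschmidt build on LDA with predefined topics; plain LDA, the starting point they extend, looks like this with scikit-learn. The toy manifesto-like snippets and the two-topic setting are assumptions, and the seeded extension itself is not reproduced here.

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

docs = [
    "tax relief for families and small business growth",
    "renewable energy climate protection emission targets",
    "lower tax for business investment and new jobs",
    "climate policy energy transition and emission trading",
]
vec = CountVectorizer(stop_words="english")
X = vec.fit_transform(docs)
lda = LatentDirichletAllocation(n_components=2, random_state=0).fit(X)
vocab = vec.get_feature_names_out()
for k, comp in enumerate(lda.components_):
    top = [vocab[i] for i in comp.argsort()[-4:][::-1]]  # four strongest words
    print(f"topic {k}:", top)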
@article{Wang20061752,
  title = {Flexible online association rule mining based on multidimensional pattern relations },
  journal = {Information Sciences },
  volume = {176},
  number = {12},
  pages = {1752 - 1780},
  year = {2006},
  note = {},
  issn = {0020-0255},
  doi = {http://dx.doi.org/10.1016/j.ins.2005.05.005},
  url = {http://www.sciencedirect.com/science/article/pii/S0020025505001635},
  author = {Ching-Yao Wang and Shian-Shyong Tseng and Tzung-Pei Hong},
  keywords = {Data mining},
  keywords = {Association rule},
  keywords = {Incremental mining},
  keywords = {Multidimensional mining},
  keywords = {Constraint-based mining},
  keywords = {Data warehouse },
  abstract = {Most incremental mining and online mining algorithms concentrate on finding association rules or patterns consistent with the entire current set of data. Users cannot easily obtain results from only the interesting portions of the data. This may prevent mining from being used for online decision support on multidimensional data. To provide ad-hoc, query-driven, and online mining support, we first propose a relation called the multidimensional pattern relation to structurally and systematically store context and mining information for later analysis. Each tuple in the relation comes from an inserted dataset in the database. We then develop an online mining approach called three-phase online association rule mining (TOARM) based on the proposed multidimensional pattern relation to support online generation of association rules under multidimensional considerations. The \{TOARM\} approach consists of three phases during which final sets of patterns satisfying various mining requests are found. It first selects and integrates related mining information in the multidimensional pattern relation, and then, if necessary, re-processes itemsets without sufficient information against the underlying datasets. Some implementation considerations for the algorithm are also stated in detail. Experiments on homogeneous and heterogeneous datasets were made, and the results show the effectiveness of the proposed approach. }
}
@article{Joita2007146,
  title = {A catallactic market for data mining services },
  journal = {Future Generation Computer Systems },
  volume = {23},
  number = {1},
  pages = {146 - 153},
  year = {2007},
  note = {},
  issn = {0167-739X},
  doi = {http://dx.doi.org/10.1016/j.future.2006.06.007},
  url = {http://www.sciencedirect.com/science/article/pii/S0167739X06001312},
  author = {L. Joita and Omer F. Rana and Felix Freitag and Isaac Chao and Pablo Chacin and Leandro Navarro and Oscar Ardaiz},
  keywords = {Data mining},
  keywords = {Grid market},
  keywords = {Application },
  abstract = {We describe a Grid market for exchanging data mining services based on the catallactic market mechanism proposed by von Hayek. This market mechanism allows selection between multiple instances of services based on the operations required in a data mining task (such as data migration, data pre-processing and subsequent data analysis). Catallaxy is a decentralized approach, based on a “free market” mechanism, and is particularly useful when the number of market participants is large or when conditions within the market change often. It is therefore particularly suitable for Grid and peer-to-peer systems. The approach assumes that the service provider and user are not co-located, and require multiple message exchanges to carry out a data mining task. A market of J48-based decision tree algorithm instances, each implemented as a Web service, is used to demonstrate our approach. We have validated the feasibility of building catallactic data mining grid applications, and implemented a proof-of-concept application (Cat-COVITE) mapped to a catallactic Grid middleware. }
}
@article{Nasraoui20061488,
  title = {A framework for mining evolving trends in Web data streams using dynamic learning and retrospective validation },
  journal = {Computer Networks },
  volume = {50},
  number = {10},
  pages = {1488 - 1512},
  year = {2006},
  note = {I. Web Dynamics II. Algorithms for Distributed Systems },
  issn = {1389-1286},
  doi = {http://dx.doi.org/10.1016/j.comnet.2005.10.021},
  url = {http://www.sciencedirect.com/science/article/pii/S1389128605003671},
  author = {Olfa Nasraoui and Carlos Rojas and Cesar Cardona},
  keywords = {Mining evolving data streams},
  keywords = {Web clickstreams},
  keywords = {Web mining},
  keywords = {Text mining},
  keywords = {User profiles },
  abstract = {The expanding and dynamic nature of the Web poses enormous challenges to most data mining techniques that try to extract patterns from Web data, such as Web usage and Web content. While scalable data mining methods are expected to cope with the size challenge, coping with evolving trends in noisy data in a continuous fashion, and without any unnecessary stoppages and reconfigurations, is still an open challenge. This dynamic and single pass setting can be cast within the framework of mining evolving data streams. The harsh restrictions imposed by the “you only get to see it once” constraint on stream data call for different computational models that may furthermore bring some interesting surprises when it comes to the behavior of some well-known similarity measures during clustering, and even validation. In this paper, we study the effect of similarity measures on the mining process and on the interpretation of the mined patterns in the harsh single pass requirement scenario. We propose a simple similarity measure that has the advantage of explicitly coupling the precision and coverage criteria to the early learning stages. Even though the cosine similarity, and its close relatives such as the Jaccard measure, have been prevalent in the majority of Web data clustering approaches, they may fail to explicitly seek profiles that achieve high coverage and high precision simultaneously. We also formulate a validation strategy and adapt several metrics rooted in information retrieval to the challenging task of validating a learned stream synopsis in dynamic environments. Our experiments confirm that the performance of the MinPC similarity is generally better than the cosine similarity, and that this outperformance can be expected to be more pronounced for data sets that are more challenging in terms of the amount of noise and/or overlap, and in terms of the level of change in the underlying profiles/topics (known sub-categories of the input data) as the input stream unravels. In our simulations, we study the task of mining and tracking trends and profiles in evolving text and Web usage data streams in a single pass, and under different trend sequencing scenarios. }
}
@article{Gajendran20071378,
  title = {An application of bioinformatics and text mining to the discovery of novel genes related to bone biology },
  journal = {Bone },
  volume = {40},
  number = {5},
  pages = {1378 - 1388},
  year = {2007},
  note = {},
  issn = {8756-3282},
  doi = {http://dx.doi.org/10.1016/j.bone.2006.12.067},
  url = {http://www.sciencedirect.com/science/article/pii/S8756328206009471},
  author = {Varun K. Gajendran and Jia-Ren Lin and David P. Fyhrie},
  keywords = {Text mining},
  keywords = {Osteoporosis},
  keywords = {Bone apoptosis},
  keywords = {Bisphosphonate},
  keywords = {Osteocyte },
  abstract = {The treatment and management of complex genetic diseases such as osteoporosis can greatly benefit from the integration of relevant research across many different disciplines. We created a text mining tool that analyzes the PubMed literature database and integrates the available genomic information to provide a detailed mapping of the genes and their interrelationships within a particular network such as osteoporosis. The results obtained from our text mining program show that existing genomic data within the PubMed database can effectively be used to predict potentially novel target genes for osteoporosis research that have not previously been reported in the literature. To filter the most significant findings, we developed a ranking system to rate our predicted novel genes. Some of our predicted genes ranked higher than those currently studied, suggesting that they may be of particular interest from a therapeutic standpoint. A preliminary analysis of the current biomedical literature in our research area using our tool suggests that S100A12, as well as a group of \{SMAD\} genes previously unstudied in relation to osteoporosis, may be highly relevant to the mechanism of action of bisphosphonates, that the function of osteocytes may be influenced by a family of important interleukins and interleukin-related molecules, and that the \{FYN\} oncogene may play an important role in regulating the apoptosis of bone cells in the context of degenerative bone diseases. An evaluation of our tool's predictive ability with an analysis of PubMed literature published before the year 2000 in the area of osteoporosis research shows that many of its top-rated novel target genes from that analysis were later studied and shown to be relevant to osteoporosis in the period between 2000 and 2006. We believe that our tool will be beneficial to researchers in the field of orthopaedics seeking to identify novel target genes in their research area, and it will allow them to delve deeper into the complex interplay between genes, biological systems and diseases. }
}
@article{Zhong2007490,
  title = {Privacy-preserving algorithms for distributed mining of frequent itemsets },
  journal = {Information Sciences },
  volume = {177},
  number = {2},
  pages = {490 - 503},
  year = {2007},
  note = {},
  issn = {0020-0255},
  doi = {http://dx.doi.org/10.1016/j.ins.2006.08.010},
  url = {http://www.sciencedirect.com/science/article/pii/S0020025506002441},
  author = {Sheng Zhong},
  keywords = {Data mining},
  keywords = {Association rules},
  keywords = {Distributed databases},
  keywords = {Privacy },
  abstract = {Standard algorithms for association rule mining are based on identification of frequent itemsets. In this paper, we study how to maintain privacy in distributed mining of frequent itemsets. That is, we study how two (or more) parties can find frequent itemsets in a distributed database without revealing each party’s portion of the data to the other. The existing solution for vertically partitioned data leaks a significant amount of information, while the existing solution for horizontally partitioned data only works for three parties or more. In this paper, we design algorithms for both vertically and horizontally partitioned data, with cryptographically strong privacy. We give two algorithms for vertically partitioned data; one of them reveals only the support count and the other reveals nothing. Both of them have computational overheads linear in the number of transactions. Our algorithm for horizontally partitioned data works for two or more parties and is more efficient than the existing solution. }
}
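
Zhong's protocols rely on cryptographic machinery not sketched here, but the classic secure-sum trick that older horizontally-partitioned schemes use (and that this paper improves upon) conveys the flavor: per-site support counts circulate masked, so no single site's count is revealed. The simulation below runs all parties in one process, and the modulus and counts are made up.

import random

def secure_sum(local_counts, modulus=2**32):
    # Ring protocol simulated centrally: the initiator injects a random mask,
    # each party adds its private count mod the modulus, the initiator unmasks.
    mask = random.randrange(modulus)
    running = mask
    for c in local_counts:
        running = (running + c) % modulus
    return (running - mask) % modulus

site_supports = [120, 87, 230]  # private per-site support counts for one itemset
print("global support:", secure_sum(site_supports))  # 437, individual counts hidden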
@article{Stumme2006124,
  title = {Semantic Web Mining: State of the art and future directions },
  journal = {Web Semantics: Science, Services and Agents on the World Wide Web },
  volume = {4},
  number = {2},
  pages = {124 - 143},
  year = {2006},
  note = {Semantic Grid --The Convergence of Technologies },
  issn = {1570-8268},
  doi = {http://dx.doi.org/10.1016/j.websem.2006.02.001},
  url = {http://www.sciencedirect.com/science/article/pii/S1570826806000084},
  author = {Gerd Stumme and Andreas Hotho and Bettina Berendt},
  keywords = {Web Mining},
  keywords = {Semantic Web},
  keywords = {Ontologies},
  keywords = {Knowledge discovery},
  keywords = {Knowledge engineering},
  keywords = {Artificial intelligence},
  keywords = {World Wide Web },
  abstract = {Semantic Web Mining aims at combining the two fast-developing research areas Semantic Web and Web Mining. This survey analyzes the convergence of trends from both areas: More and more researchers are working on improving the results of Web Mining by exploiting semantic structures in the Web, and they make use of Web Mining techniques for building the Semantic Web. Last but not least, these techniques can be used for mining the Semantic Web itself. The Semantic Web is the second-generation WWW, enriched by machine-processable information which supports the user in his tasks. Given the enormous size even of today’s Web, it is impossible to manually enrich all of these resources. Therefore, automated schemes for learning the relevant information are increasingly being used. Web Mining aims at discovering insights about the meaning of Web resources and their usage. Given the primarily syntactical nature of the data being mined, the discovery of meaning is impossible based on these data only. Therefore, formalizations of the semantics of Web sites and navigation behavior are becoming more and more common. Furthermore, mining the Semantic Web itself is another upcoming application. We argue that the two areas Web Mining and Semantic Web need each other to fulfill their goals, but that the full potential of this convergence is not yet realized. This paper gives an overview of where the two areas meet today, and sketches ways of how a closer integration could be profitable. }
}
@article{Damle2007305,
  title = {Flood prediction using Time Series Data Mining },
  journal = {Journal of Hydrology },
  volume = {333},
  number = {2–4},
  pages = {305 - 316},
  year = {2007},
  note = {},
  issn = {0022-1694},
  doi = {http://dx.doi.org/10.1016/j.jhydrol.2006.09.001},
  url = {http://www.sciencedirect.com/science/article/pii/S0022169406004331},
  author = {Chaitanya Damle and Ali Yalcin},
  keywords = {River flood forecasting},
  keywords = {Time Series Data Mining},
  keywords = {Chaotic systems},
  keywords = {Event prediction },
  abstract = {This paper describes a novel approach to river flood prediction using Time Series Data Mining, which combines chaos theory and data mining to characterize and predict events in complex, nonperiodic and chaotic time series. Geophysical phenomena, including earthquakes, floods and rainfall, represent a class of nonlinear systems termed chaotic, in which the relationships between variables in a system are dynamic and disproportionate, yet completely deterministic. Chaos theory provides a structured explanation for irregular behavior and anomalies in systems that are not inherently stochastic. While nonlinear approaches such as Artificial Neural Networks, Hidden Markov Models and Nonlinear Prediction are useful in forecasting daily discharge values in a river, the focus of these approaches is on forecasting the magnitudes of future discharge values rather than the prediction of floods. The described Time Series Data Mining methodology focuses on the prediction of events, where floods constitute the events in a river's daily discharge time series. The methodology is demonstrated using data collected at the St. Louis gauging station located on the Mississippi River in the USA. Results associated with the impact of earliness of prediction and the acceptable risk-level vs. prediction accuracy are presented. }
}
@article{Singh2007131,
  title = {Text mining a decade of progress in hospitality human resource management research: Identifying emerging thematic development },
  journal = {International Journal of Hospitality Management },
  volume = {26},
  number = {1},
  pages = {131 - 147},
  year = {2007},
  note = {},
  issn = {0278-4319},
  doi = {http://dx.doi.org/10.1016/j.ijhm.2005.10.002},
  url = {http://www.sciencedirect.com/science/article/pii/S027843190500099X},
  author = {Neha Singh and Clark Hu and Wesley S. Roehl},
  keywords = {Conceptual graphs},
  keywords = {Content analysis},
  keywords = {Hospitality research},
  keywords = {Human resource management (HRM)},
  keywords = {Text mining },
  abstract = {The authors identified the emerging research streams based on the published research literature in human resource management (HRM) from 1994 to 2003 in the International Journal of Hospitality Management. Textual data were collected and content-analyzed by a text-mining program aided by human judgments. The results from the content analysis of both the computer-aided and human judgmental methods were then integrated and conceptually graphed to map meaningful findings that were logically precise, humanly readable, and computationally tractable. Through this unique approach, nine major \{HRM\} research themes emerged and each thematic development based on time and country was interpreted and discussed. }
}
@article{Tobarra2014659,
  title = {Analyzing the students’ behavior and relevant topics in virtual learning communities },
  journal = {Computers in Human Behavior },
  volume = {31},
  number = {0},
  pages = {659 - 669},
  year = {2014},
  note = {},
  issn = {0747-5632},
  doi = {http://dx.doi.org/10.1016/j.chb.2013.10.001},
  url = {http://www.sciencedirect.com/science/article/pii/S0747563213003518},
  author = {Llanos Tobarra and Antonio Robles-Gómez and Salvador Ros and Roberto Hernández and Agustín C. Caminero},
  keywords = {Learning analytics},
  keywords = {Students’ behavior},
  keywords = {Topic characterization},
  keywords = {Virtual communities },
  abstract = {The constant development of new Internet platforms is shifting users’ role on such platforms from viewers to main actors. In the field of education, faculty can take advantage of these new technologies for the design of pedagogical contents. The face-to-face observation of behavioral patterns allows faculty to detect and track new problems, and to apply possible corrections which would improve the learning/teaching process. However, with a distance methodology, these observations are not possible. When forums are created, they are intended to discuss particular topics. It is relevant to monitor that the topics discussed are the intended ones in order to achieve course objectives. To tackle this shortcoming, our work studies the dynamics of relevant topics in on-line asynchronous discussion forums, and this is done by analyzing the large amount of students’ interactions generated in the forums of our Learning Management System (LMS). In particular, we analyze the students’ behavior patterns in the forums of a distance subject, and characterize the relevant topics and subtopics from the forums’ messages belonging to two academic years. From the statistical and graphical results obtained, a set of valuable recommendations is also given. }
}
@article{Hong2007223,
  title = {Mining fuzzy β-certain and β-possible rules from quantitative data based on the variable precision rough-set model },
  journal = {Expert Systems with Applications },
  volume = {32},
  number = {1},
  pages = {223 - 232},
  year = {2007},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2005.11.009},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417405003258},
  author = {Tzung-Pei Hong and Tzu-Ting Wang and Shyue-Liang Wang},
  keywords = {Fuzzy set},
  keywords = {Rough set},
  keywords = {Data mining},
  keywords = {Certain rule},
  keywords = {Possible rule},
  keywords = {Quantitative value },
  abstract = {The rough-set theory, proposed by Pawlak, has been widely used in dealing with data classification problems. The original rough-set model is, however, quite sensitive to noisy data. Ziarko thus proposed the variable precision rough-set model to deal with noisy data and uncertain information. This model allowed for some degree of uncertainty and misclassification in the mining process. Conventionally, the mining algorithms based on the rough-set theory identify the relationships among data using crisp attribute values; however, data with quantitative values are commonly seen in real-world applications. This paper thus deals with the problem of producing a set of fuzzy certain and fuzzy possible rules from quantitative data with a predefined tolerance degree of uncertainty and misclassification. A new method, which combines the variable precision rough-set model and the fuzzy set theory, is thus proposed to solve this problem. It first transforms each quantitative value into a fuzzy set of linguistic terms using membership functions and then calculates the fuzzy β-lower and the fuzzy β-upper approximations. The certain and possible rules are then generated based on these fuzzy approximations. These rules can then be used to classify unknown objects. The paper thus extends the existing rough-set mining approaches to process quantitative data with tolerance of noise and uncertainty. }
}
@article{Lin2013202,
  title = {Mining stable patterns in multiple correlated databases },
  journal = {Decision Support Systems },
  volume = {56},
  number = {0},
  pages = {202 - 210},
  year = {2013},
  note = {},
  issn = {0167-9236},
  doi = {http://dx.doi.org/10.1016/j.dss.2013.06.003},
  url = {http://www.sciencedirect.com/science/article/pii/S0167923613001747},
  author = {Yaojin Lin and Xuegang Hu and Xiaomei Li and Xindong Wu},
  keywords = {Multiple correlated databases},
  keywords = {Stable patterns},
  keywords = {Hierarchical clustering},
  keywords = {Gray relational analysis },
  abstract = {Many kinds of patterns (e.g., association rules, negative association rules, sequential patterns, and temporal patterns) have been studied for various applications, but very little work has been reported on multiple correlated databases that are all relevant. This paper proposes an efficient method for mining stable patterns from multiple correlated databases. First, we define the notion of stable items according to two constraint conditions, minsupp and varivalue. We then measure the similarity between stable items based on gray relational analysis, and present a hierarchical gray clustering method for mining stable patterns consisting of stable items. Finally, experiments are conducted on four datasets, and the results of the experiments show that our method is useful and efficient. }
}
@article{Nicholson2006785,
  title = {The basis for bibliomining: Frameworks for bringing together usage-based data mining and bibliometrics through data warehousing in digital library services },
  journal = {Information Processing \& Management },
  volume = {42},
  number = {3},
  pages = {785 - 804},
  year = {2006},
  note = {},
  issn = {0306-4573},
  doi = {http://dx.doi.org/10.1016/j.ipm.2005.05.008},
  url = {http://www.sciencedirect.com/science/article/pii/S0306457305000658},
  author = {Scott Nicholson},
  keywords = {Data mining},
  keywords = {Data warehousing},
  keywords = {Digital libraries},
  keywords = {Bibliomining},
  keywords = {Evaluation},
  keywords = {Theory},
  keywords = {Library measurement},
  keywords = {Library evaluation },
  abstract = {Over the past few years, data mining has moved from corporations to other organizations. This paper looks at the integration of data mining in digital library services. First, bibliomining, or the combination of bibliometrics and data mining techniques to understand library services, is defined and the concept explored. Second, the conceptual frameworks for bibliomining from the viewpoint of the library decision-maker and the library researcher are presented and compared. Finally, a research agenda to resolve many of the common bibliomining issues and to move the field forward in a mindful manner is developed. The result is not only a roadmap for understanding the integration of data mining in digital library services, but also a template for other cross-discipline data mining researchers to follow for systematic exploration in their own subject domains. }
}
@article{Kwok2007406,
  title = {Employing web mining and data fusion to improve weak ad hoc retrieval },
  journal = {Information Processing \& Management },
  volume = {43},
  number = {2},
  pages = {406 - 419},
  year = {2007},
  note = {Special issue on AIRS2005: Information Retrieval Research in Asia },
  issn = {0306-4573},
  doi = {http://dx.doi.org/10.1016/j.ipm.2006.07.008},
  url = {http://www.sciencedirect.com/science/article/pii/S0306457306001245},
  author = {Kui-Lam Kwok and Laszlo Grunfeld and Peter Deng},
  keywords = {Weak query},
  keywords = {Robust retrieval},
  keywords = {Salient term selection},
  keywords = {Web mining},
  keywords = {Alternate queries},
  keywords = {Data fusion },
  abstract = {When a user issues a reasonable query to a retrieval system and obtains no relevant documents, he or she is bound to feel frustrated. We call these weak queries and retrievals. Improving their effectiveness is an important issue for ad hoc retrieval and would be most rewarding for these users. We explain why data fusion of sufficiently dissimilar retrieval lists can improve weak query results and confirm this with experiments using short and medium size queries. To realize sufficiently dissimilar retrieval lists, we propose composing alternate queries through web search and mining, employ them for target retrieval, and combine with the original query retrieval list. Methods of forming web probes from longer queries, including salient term selection and query text window rotation, are investigated. When compared with normal ad hoc retrieval, web assistance and data fusion can more than double the original weak query effectiveness. Other queries can also improve along with weak ones. }
}
@article{Wołk2014126,
  title = {Building Subject-aligned Comparable Corpora and Mining it for Truly Parallel Sentence Pairs },
  journal = {Procedia Technology },
  volume = {18},
  number = {0},
  pages = {126 - 132},
  year = {2014},
  note = {International workshop on Innovations in Information and Communication Science and Technology, \{IICST\} 2014, 3-5 September 2014, Warsaw, Poland },
  issn = {2212-0173},
  doi = {http://dx.doi.org/10.1016/j.protcy.2014.11.024},
  url = {http://www.sciencedirect.com/science/article/pii/S2212017314005453},
  author = {Krzysztof Wołk and Krzysztof Marasek},
  keywords = {Comparable corpora},
  keywords = {machine translation},
  keywords = {\{NLP\} },
  abstract = {Parallel sentences are a relatively scarce but extremely useful resource for many applications including cross-lingual retrieval and statistical machine translation. This research explores our methodology for mining such data from previously obtained comparable corpora. The task is highly practical since non-parallel multilingual data exist in far greater quantities than parallel corpora, but parallel sentences are a much more useful resource. Here we propose a web crawling method for building subject-aligned comparable corpora from Wikipedia articles. We also introduce a method for extracting truly parallel sentences that are filtered out from noisy or just comparable sentence pairs. We describe our implementation of a specialized tool for this task as well as the training and adaptation of a machine translation system that supplies our filter with additional information about the similarity of comparable sentence pairs. }
}
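
A heavily simplified version of the filtering step Wołk and Marasek describe: translate the source side with some MT system, then keep only candidate pairs whose similarity to the target side clears a threshold. The toy lexicon stands in for a real translation system, and the bag-of-words cosine and the 0.4 threshold are assumptions.

from collections import Counter
import math

def cosine(a, b):
    u, v = Counter(a.lower().split()), Counter(b.lower().split())
    dot = sum(u[w] * v[w] for w in u)
    norm = (math.sqrt(sum(c * c for c in u.values()))
            * math.sqrt(sum(c * c for c in v.values())))
    return dot / norm if norm else 0.0

toy_lexicon = {"der": "the", "hund": "dog", "schläft": "sleeps"}
def machine_translate(sentence):
    # Stand-in for a real MT system: word-by-word lexicon lookup.
    return " ".join(toy_lexicon.get(w, w) for w in sentence.lower().split())

candidates = [
    ("der Hund schläft", "the dog sleeps"),
    ("der Hund schläft", "stock markets fell sharply today"),
]
THRESHOLD = 0.4  # would be tuned on held-out data in practice
parallel = [(s, t) for s, t in candidates
            if cosine(machine_translate(s), t) >= THRESHOLD]
print(parallel)  # only the truly parallel pair survives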
@incollection{Shi2014320,
  title = {Chapter 10 - A Practical Software System of Data Mining and Knowledge Discovery for Geosciences },
  editor = {Shi, Guangren },
  booktitle = {Data Mining and Knowledge Discovery for Geoscientists },
  publisher = {Elsevier},
  edition = {},
  address = {Oxford},
  year = {2014},
  pages = {320 - 340},
  isbn = {978-0-12-410437-2},
  doi = {http://dx.doi.org/10.1016/B978-0-12-410437-2.00010-2},
  url = {http://www.sciencedirect.com/science/article/pii/B9780124104372000102},
  author = {Guangren Shi},
  keywords = {software system},
  keywords = {system components},
  keywords = {data input},
  keywords = {data preprocessing},
  keywords = {algorithm selection},
  keywords = {algorithms running},
  keywords = {results output},
  keywords = {regression algorithms},
  keywords = {classification algorithms},
  keywords = {oil layer classification },
  abstract = {This chapter presents a practical software system of data mining and knowledge discovery for geosciences. The system consists of five modules: data input, data preprocessing, algorithm selection, running the selected algorithms, and results output. Minable subsurface data are described first. In the data preprocessing module, Q-mode cluster analysis is recommended to serve as a pioneering sample-reduction tool, while MRA, \{BAYSD\} and R-mode cluster analysis are recommended to serve as pioneering dimension-reduction tools. In the algorithm selection module, only R-SVM and \{BPNN\} are taken as regression algorithms, while only C-SVM and \{BAYSD\} are taken as classification algorithms. To verify the feasibility and validity of the automatic selection of regression and classification algorithms in this system, three typical case studies are provided. The algorithm selected in the three case studies is C-SVM or BAYSD, C-SVM or BAYSD, and C-SVM, respectively. Finally, ten exercises are provided. }
}
@article{Rinaldi2007127,
  title = {Mining of relations between proteins over biomedical scientific literature using a deep-linguistic approach },
  journal = {Artificial Intelligence in Medicine },
  volume = {39},
  number = {2},
  pages = {127 - 136},
  year = {2007},
  note = {Artificial Intelligence in Medicine \{AIME\} '05 },
  issn = {0933-3657},
  doi = {http://dx.doi.org/10.1016/j.artmed.2006.08.005},
  url = {http://www.sciencedirect.com/science/article/pii/S0933365706001370},
  author = {Fabio Rinaldi and Gerold Schneider and Kaarel Kaljurand and Michael Hess and Christos Andronis and Ourania Konstandi and Andreas Persidis},
  keywords = {Information extraction},
  keywords = {Text mining},
  keywords = {Dependency parsing},
  keywords = {Biomedical literature},
  keywords = {Protein interactions },
  abstract = {Objective: The amount of new discoveries (as published in the scientific literature) in the biomedical area is growing at an exponential rate. This growth makes it very difficult to filter the most relevant results, and thus the extraction of the core information becomes very expensive. Therefore, there is a growing interest in text processing approaches that can deliver selected information from scientific publications, which can limit the amount of human intervention normally needed to gather those results. Materials and methods: This paper presents and evaluates an approach aimed at automating the process of extracting functional relations (e.g. interactions between genes and proteins) from scientific literature in the biomedical domain. The approach, using a novel dependency-based parser, is based on a complete syntactic analysis of the corpus. Results: We have implemented a state-of-the-art text mining system for biomedical literature, based on a deep-linguistic, full-parsing approach. The results are validated on two different corpora: the manually annotated genomics information access (GENIA) corpus and the automatically annotated arabidopsis thaliana circadian rhythms (ATCR) corpus. Conclusion: We show how a deep-linguistic approach (contrary to common belief) can be used in a real-world text mining application, offering high-precision relation extraction while at the same time retaining sufficient recall. }
}
@article{Farid20135895,
  title = {An adaptive ensemble classifier for mining concept drifting data streams },
  journal = {Expert Systems with Applications },
  volume = {40},
  number = {15},
  pages = {5895 - 5906},
  year = {2013},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2013.05.001},
  url = {http://www.sciencedirect.com/science/article/pii/S095741741300287X},
  author = {Dewan Md. Farid and Li Zhang and Alamgir Hossain and Chowdhury Mofizur Rahman and Rebecca Strachan and Graham Sexton and Keshav Dahal},
  keywords = {Adaptive ensembles},
  keywords = {Concept drift},
  keywords = {Clustering},
  keywords = {Data streams},
  keywords = {Decision trees},
  keywords = {Novel classes },
  abstract = {It is challenging to use traditional data mining techniques to deal with real-time data stream classifications. Existing mining classifiers need to be updated frequently to adapt to the changes in data streams. To address this issue, in this paper we propose an adaptive ensemble approach for classification and novel class detection in concept drifting data streams. The proposed approach uses traditional mining classifiers and updates the ensemble model automatically so that it represents the most recent concepts in data streams. For novel class detection we consider the idea that data points belonging to the same class should be closer to each other and far apart from data points belonging to other classes. If a data point is well separated from the existing data clusters, it is identified as a novel class instance. We tested the performance of the proposed stream classification model against that of existing mining algorithms using real benchmark datasets from the \{UCI\} (University of California, Irvine) machine learning repository. The experimental results show that our approach exhibits great flexibility and robustness in novel class detection under concept drift and outperforms traditional classification models in challenging real-life data stream applications. }
}
@article{Christmann2007347,
  title = {Robust learning from bites for data mining },
  journal = {Computational Statistics & Data Analysis },
  volume = {52},
  number = {1},
  pages = {347 - 361},
  year = {2007},
  note = {},
  issn = {0167-9473},
  doi = {http://dx.doi.org/10.1016/j.csda.2006.12.009},
  url = {http://www.sciencedirect.com/science/article/pii/S0167947306004853},
  author = {Andreas Christmann and Ingo Steinwart and Mia Hubert},
  keywords = {Breakdown point},
  keywords = {Convex risk minimization},
  keywords = {Data mining},
  keywords = {Distributed computing},
  keywords = {Influence function},
  keywords = {Logistic regression},
  keywords = {Robustness},
  keywords = {Scalability},
  keywords = {Statistical machine learning},
  keywords = {Support vector machine },
  abstract = {Some methods from statistical machine learning and from robust statistics have two drawbacks. Firstly, they are computer-intensive such that they can hardly be used for massive data sets, say with millions of data points. Secondly, robust and non-parametric confidence intervals for the predictions according to the fitted models are often unknown. A simple but general method is proposed to overcome these problems in the context of huge data sets. An implementation of the method is scalable to the memory of the computer and can be distributed on several processors to reduce the computation time. The method offers distribution-free confidence intervals for the median of the predictions. The main focus is on general support vector machines (SVM) based on minimizing regularized risks. As an example, a combination of two methods from modern statistical machine learning, i.e. kernel logistic regression and ε-support vector regression, is used to model a data set from several insurance companies. The approach can also be helpful to fit robust estimators in parametric models for huge data sets. }
}
@article{Miotto20131145,
  title = {Unsupervised mining of frequent tags for clinical eligibility text indexing },
  journal = {Journal of Biomedical Informatics },
  volume = {46},
  number = {6},
  pages = {1145 - 1151},
  year = {2013},
  note = {Special Section: Social Media Environments },
  issn = {1532-0464},
  doi = {http://dx.doi.org/10.1016/j.jbi.2013.08.012},
  url = {http://www.sciencedirect.com/science/article/pii/S1532046413001408},
  author = {Riccardo Miotto and Chunhua Weng},
  keywords = {Information storage and retrieval},
  keywords = {Clinical trials},
  keywords = {Tags},
  keywords = {Information filtering},
  keywords = {Eligibility criteria},
  keywords = {Controlled vocabulary },
  abstract = {Clinical text, such as clinical trial eligibility criteria, is largely underused in state-of-the-art medical search engines due to difficulties of accurate parsing. This paper proposes a novel methodology to derive a semantic index for clinical eligibility documents based on a controlled vocabulary of frequent tags, which are automatically mined from the text. We applied this method to eligibility criteria on ClinicalTrials.gov and report that frequent tags (1) define an effective and efficient index of clinical trials and (2) are unlikely to grow radically when the repository increases. We proposed to apply the semantic index to filter clinical trial search results and we concluded that frequent tags reduce the result space more efficiently than an uncontrolled set of \{UMLS\} concepts. Overall, unsupervised mining of frequent tags from clinical text leads to an effective semantic index for the clinical eligibility documents and promotes their computational reuse. }
}
@article{Yoon2013122,
  title = {A Practical Approach for Content Mining of Tweets },
  journal = {American Journal of Preventive Medicine },
  volume = {45},
  number = {1},
  pages = {122 - 129},
  year = {2013},
  note = {},
  issn = {0749-3797},
  doi = {http://dx.doi.org/10.1016/j.amepre.2013.02.025},
  url = {http://www.sciencedirect.com/science/article/pii/S0749379713002432},
  author = {Sunmoo Yoon and Noémie Elhadad and Suzanne Bakken},
  abstract = {Use of data generated through social media for health studies is gradually increasing. Twitter is a short-text message system developed 6 years ago, now with more than 100 million users generating over 300 million Tweets every day. Twitter may be used to gain real-world insights to promote healthy behaviors. The purposes of this paper are to describe a practical approach to analyzing Tweet contents and to illustrate an application of the approach to the topic of physical activity. The approach includes five steps: (1) selecting keywords to gather an initial set of Tweets to analyze; (2) importing data; (3) preparing data; (4) analyzing data (topic, sentiment, and ecologic context); and (5) interpreting data. The steps are implemented using tools that are publicly available and free of charge and designed for use by researchers with limited programming skills. Content mining of Tweets can contribute to addressing challenges in health behavior research. }
}
@article{Jian2014914,
  title = {How did national life expectation related to school years in developing countries—An approach using panel data mining },
  journal = {Computer Methods and Programs in Biomedicine },
  volume = {113},
  number = {3},
  pages = {914 - 918},
  year = {2014},
  note = {},
  issn = {0169-2607},
  doi = {http://dx.doi.org/10.1016/j.cmpb.2013.11.016},
  url = {http://www.sciencedirect.com/science/article/pii/S016926071300388X},
  author = {Wen-Shan Jian and Chen-Ling Huang and Usman Iqbal and Phung-Anh Nguyen and George Hsiao and Hsien-Chang Li},
  keywords = {Panel data analysis},
  keywords = {Life expectancy},
  keywords = {Education},
  keywords = {\{OECD\} },
  abstract = {Background: The purpose of the study was to probe into the changes in life expectancy associated with schooling years found by the Organization for Economic Co-operation and Development (OECD). Methods: The study was based on the \{OECD\} database from the period 2000 to 2006. The data of thirty countries were constructed to allow comparisons over time and across these countries. Panel data analysis was used to estimate the relationship of national education, defined as school years, with life expectancy. The control factors considered were numbers of practicing physicians, practicing nurses, hospital beds, and GDP. Results: Using fixed effects of both country and time in a linear regression, the coefficient of school years in relation to life expectancy was statistically significant but negative. This finding is not in accord with the hypothesis that investing in human capital through education stimulates better health outcomes. Conclusion: Within developing countries, educational attainment is no longer keeping pace with life expectancy as before. Therefore, we suggest that an effective education policy should cover diverse topics, for example, balancing economic growth and mental hygiene, to improve national life expectancy. }
}
@article{deMirandaSanto20061013,
  title = {Text mining as a valuable tool in foresight exercises: A study on nanotechnology },
  journal = {Technological Forecasting and Social Change },
  volume = {73},
  number = {8},
  pages = {1013 - 1027},
  year = {2006},
  note = {Tech Mining: Exploiting Science and Technology Information Resources },
  issn = {0040-1625},
  doi = {http://dx.doi.org/10.1016/j.techfore.2006.05.020},
  url = {http://www.sciencedirect.com/science/article/pii/S004016250600117X},
  author = {Marcio de Miranda Santo and Gilda Massari Coelho and Dalci Maria dos Santos and Lélio Fellows Filho},
  keywords = {Foresight},
  keywords = {Science, technology and innovation},
  keywords = {Future studies},
  keywords = {Methods and techniques},
  keywords = {Text mining},
  keywords = {Nanotechnology },
  abstract = {Since its inception in 2001, the Center for Management and Strategic Studies (CGEE) has as its main activity the conduct of foresight studies in support of the decision-making process related to the establishment of ST&I policies and activities in Brazil. The methodology used by the center combines quantitative and qualitative methods. Explicit and tacit knowledge is mobilized in the process of developing complementary or differentiated visions of the future. Most of the studies conducted by \{CGEE\} begin with data monitoring activities, making use of text mining techniques. One case study carried out by \{CGEE\} in the field of nanotechnology is presented. In this case, text mining was used at the first stage, followed by qualitative techniques. Results were used to guide government agencies to fund nanotechnology R&D to help raise the competitiveness of several sectors of the Brazilian economy. }
}
@article{Dash2007185,
  title = {Mining for similarities in time series data using wavelet-based feature vectors and neural networks },
  journal = {Engineering Applications of Artificial Intelligence },
  volume = {20},
  number = {2},
  pages = {185 - 201},
  year = {2007},
  note = {Special Issue on Applications of Artificial Intelligence in Process Systems Engineering },
  issn = {0952-1976},
  doi = {http://dx.doi.org/10.1016/j.engappai.2006.06.018},
  url = {http://www.sciencedirect.com/science/article/pii/S0952197606001187},
  author = {P.K. Dash and Maya Nayak and M.R. Senapati and I.W.C. Lee},
  keywords = {Wavelet Transform},
  keywords = {S-transform},
  keywords = {Time series},
  keywords = {Similarity detection},
  keywords = {Classification},
  keywords = {Knowledge discovery},
  keywords = {Data mining },
  abstract = {This paper presents a comparison between different wavelet feature vectors for data mining of nonstationary time series that occur in an electricity supply network. Three different wavelet algorithms are simulated and applied to nine classes of power signal time series, which primarily belong to an important problem area called electric power quality. In contrast to wavelet analysis, the paper presents a new approach, S-transform-based time–frequency analysis, for processing power quality disturbance data. Certain pertinent feature vectors are extracted using the well-known wavelet methods and the new approach using the S-transform. Neural networks are then used to compute the classification accuracy of the feature vectors. Certain characteristics of the wavelet feature vectors are apparent from the results. Further, large data sets are partitioned and the similarities of pattern vectors present in different sections are determined. The approach is a general one and can be applied to pattern classification, similarity determination, and knowledge discovery in time-varying data patterns occurring in many practical science and engineering problems. }
}
@article{Rhee20141,
  title = {“I know you are not, but if you were asking me”: On emergence of discourse markers of topic presentation from hypothetical questions },
  journal = {Journal of Pragmatics },
  volume = {60},
  number = {0},
  pages = {1 - 16},
  year = {2014},
  note = {},
  issn = {0378-2166},
  doi = {http://dx.doi.org/10.1016/j.pragma.2013.10.005},
  url = {http://www.sciencedirect.com/science/article/pii/S0378216613002701},
  author = {Seongha Rhee},
  keywords = {Grammaticalization},
  keywords = {Topic-presenter},
  keywords = {Rhetorical question},
  keywords = {Discourse strategy},
  keywords = {Feigned interactivity},
  keywords = {Intersubjectification },
  abstract = {Korean has a number of grammatical devices to introduce topics into the discourse. Among such markers is a paradigm of periphrastic topic presenters that are built on rhetorical questions, bringing micro-level topics into the discourse. The major strategy involved in the development of these topic presenters is feigning interactivity, whereby the speaker rhetorically asks a hypothetical question on behalf of the addressee and then answers it. This rhetorical question strategy is an intriguing discourse manipulation to create an engaging effect in that what the speaker pursues from the addressee is not a verbal response, i.e. reply, but a cognitive response, i.e. attention. A historical investigation reveals a number of important implications in grammaticalization studies. For instance, the grammaticalization process of these innovative topic markers creates a template-like paradigm of periphrastic constructions that contain slots to be filled in from another grammatical paradigm of interrogative pronouns and adverbs. Furthermore, the historical developmental pattern of the emerging paradigm strongly suggests that the formative process is enabled by analogy. In addition, the directionality of the process is from the domain of discourse, i.e. rhetoric, contra most traditional instances that undergo the process proceeding from lexical domain to grammatical domain. }
}
@article{Moran2013733,
  title = {Using the multiple capitals framework to connect indicators of regional cumulative impacts of mining and pastoralism in the Murray Darling Basin, Australia },
  journal = {Resources Policy },
  volume = {38},
  number = {4},
  pages = {733 - 744},
  year = {2013},
  note = {},
  issn = {0301-4207},
  doi = {http://dx.doi.org/10.1016/j.resourpol.2013.01.002},
  url = {http://www.sciencedirect.com/science/article/pii/S0301420713000032},
  author = {C.J. Moran and D.M. Franks and L.J. Sonter},
  keywords = {Cumulative effects assessment and management (CEAM)},
  keywords = {Cumulative impacts},
  keywords = {Complex systems},
  keywords = {Resource capitals},
  keywords = {Sustainability},
  keywords = {Strategic environmental assessment (SEA) },
  abstract = {It is commonly recognized that there are constraints to successful regional-scale assessment and monitoring of cumulative impacts because of challenges in the selection of coherent and measurable indicators of the effects. It has also been sensibly declared that the connections between components in a region are as important as the state of the elements themselves. These have previously been termed “linked” cumulative impacts/effects. These connections can be difficult to discern because of a complicated set of interactions and unexpected linkages. In this paper we diagnose that a significant cause of these constraints is the selection of indicators without due regard for their inter-relationships in the formulation of the indicator set. The paper examines whether the common “forms of capital”, i.e., natural (renewable and non-renewable), manufactured, social, human and financial capitals, framework is a potential organizing structure. We examine a large region in western \{NSW\} Australia where the predominant production systems are mining and grazing for production of wool, beef and lamb. Production in both is driven by consumption of a non-renewable resource, i.e., ore for mining and topsoil for grazing, the latter on the basis that loss rate estimates far exceed soil formation rates. We propose that the challenge of identifying connections of components within and between capital stores can be approached by explicitly separating stores of capital and the flows of capital between stores and between elements within stores, so-called capital fluxes. We attempt to acquire data from public sources for both capital stores and fluxes. The question of whether these data are a sufficient base for regional assessment, with particular reference to connections, is discussed. The well-described challenge of a comparative common currency for stores and fluxes is also discussed. We conclude that the data acquisition is relatively successful for stores and fluxes. A number of linked impacts are identified and discussed. The potential use of money as the common currency for stores and fluxes of capital is considered. The basic proposition is that replacement or preservation costs be used for this. We conclude that the study is sufficiently positive to consider further research in fully-coupled models of capital stores and fluxes. }
}
@article{Becker2006966,
  title = {Innovations in meta-analysis and social impact analysis relevant for tech mining },
  journal = {Technological Forecasting and Social Change },
  volume = {73},
  number = {8},
  pages = {966 - 980},
  year = {2006},
  note = {Tech Mining: Exploiting Science and Technology Information Resources },
  issn = {0040-1625},
  doi = {http://dx.doi.org/10.1016/j.techfore.2006.01.008},
  url = {http://www.sciencedirect.com/science/article/pii/S0040162506001223},
  author = {Henk A. Becker and Karin Sanders},
  keywords = {Extended meta-analysis},
  keywords = {Information extraction},
  keywords = {Data mining},
  keywords = {Data warehouse},
  keywords = {Meta-analysis},
  keywords = {Non-utilization of knowledge},
  keywords = {Scenario-to-strategy workshop},
  keywords = {Social impact assessment},
  keywords = {Solidarity at work},
  keywords = {Technology generations},
  keywords = {Think tanks},
  keywords = {Value transfer },
  abstract = {In the social sciences, non-utilization of knowledge is a major problem. Many publications stored in libraries or available on the Internet should be used more than they are now. Conventional approaches like providing abstracts and lists of keywords have proven to be insufficient. For more than thirty years, meta-analysis has been available for the accumulation and dissemination of scientific knowledge. In the social sciences, meta-analysis has been used on a limited scale only, mainly because there still remains a gap between the knowledge available and its application in policymaking. Recently, value transfer has been introduced as an additional method to bridge the gap between available knowledge and the demands for knowledge in new problem areas. Not only in the social sciences but also in the information sciences, non-utilization of information is a major problem. It is the mission of tech mining to contribute to a mitigation of this non-utilization. In this article, we will show how tech mining could profit from innovations in meta-analysis and social impact assessment. Special attention will be paid to research on technology generations, research on social change in cohesive social systems showing solidarity at work, and tech mining in support of the Lisbon Strategy of the European Commission. }
}
@article{Khrouf20143,
  title = {Mining events connections on the social web: Real-time instance matching and data analysis in EventMedia },
  journal = {Web Semantics: Science, Services and Agents on the World Wide Web },
  volume = {24},
  number = {0},
  pages = {3 - 10},
  year = {2014},
  note = {The Semantic Web Challenge 2012 },
  issn = {1570-8268},
  doi = {http://dx.doi.org/10.1016/j.websem.2014.02.003},
  url = {http://www.sciencedirect.com/science/article/pii/S1570826814000080},
  author = {Houda Khrouf and Vuk Milicic and Raphaël Troncy},
  keywords = {EventMedia},
  keywords = {\{LODE\} ontology},
  keywords = {Real-time},
  keywords = {Instance matching},
  keywords = {Event-based social network },
  abstract = {Event and media services have recently witnessed rapid growth, driving the way people explore information of interest. A significant amount of social calendars, media memes and background knowledge is created daily on various platforms, conveying event clues or past user experience. Mining, in real-time, the connections of these distributed data fragments provides a key advantage not only to deliver enriched views, but also to gain insight into interesting sociological aspects. To this aim, we harness the power of Semantic Web technologies as a means to easily steer the data integration and analysis. Our overall goal is to build a web-based environment that allows users to discover meaningful, surprising or entertaining connections between events, media and people. In this paper, we present EventMedia, a platform that provides descriptions of events associated with media and interlinked with the Linked Data cloud. It draws on live data updates and real-time interlinking to cope with the natural dynamics of events. A user-friendly interface has been designed to meet user needs: relive experiences based on media and support decision making for attending upcoming events. }
}
@article{Wang2006240,
  title = {Efficient mining of group patterns from user movement data },
  journal = {Data & Knowledge Engineering },
  volume = {57},
  number = {3},
  pages = {240 - 282},
  year = {2006},
  note = {},
  issn = {0169-023X},
  doi = {http://dx.doi.org/10.1016/j.datak.2005.04.006},
  url = {http://www.sciencedirect.com/science/article/pii/S0169023X05000558},
  author = {Yida Wang and Ee-Peng Lim and San-Yih Hwang},
  keywords = {Group pattern mining},
  keywords = {Mobile data mining},
  keywords = {Location summarization },
  abstract = {In this paper, we present a new approach to derive groupings of mobile users based on their movement data. We assume that the user movement data are collected by logging location data emitted from mobile devices tracking users. We formally define a group pattern as a group of users that are within a distance threshold from one another for at least a minimum duration. To mine group patterns, we first propose two algorithms, namely \{AGP\} and VG-growth. In our first set of experiments, it is shown that when both the number of users and the logging duration are large, \{AGP\} and VG-growth are inefficient for mining group patterns of size two. We therefore propose a framework that summarizes user movement data before group pattern mining. In the second series of experiments, we show that the methods using location summarization significantly reduce the mining overheads for group patterns of size two. We conclude that the cuboid based summarization methods give better performance when the summarized database size is small compared to the original movement database. In addition, we also evaluate the impact of parameters on the mining overhead. }
}
@article{Knierzinger201420,
  title = {The socio-political implications of bauxite mining in Guinea: A commodity chain perspective },
  journal = {The Extractive Industries and Society },
  volume = {1},
  number = {1},
  pages = {20 - 27},
  year = {2014},
  note = {},
  issn = {2214-790X},
  doi = {http://dx.doi.org/10.1016/j.exis.2014.01.005},
  url = {http://www.sciencedirect.com/science/article/pii/S2214790X14000124},
  author = {Johannes Knierzinger},
  keywords = {Guinea},
  keywords = {Bauxite},
  keywords = {Aluminum},
  keywords = {Development},
  keywords = {Global commodity chains },
  abstract = {For more than 60 years, the everyday lives of Guineans have been shaped by the decisions of a few multinational companies engaged in aluminum production. This sector – the country's most important – is highly concentrated, vertically integrated, capital-intensive and strongly interconnected. Focusing on two historical “crises of chain governance” (the 1970s and 2000s) which temporarily increased the political clout of many resource-rich countries, this article identifies the policy options facing Guinea's decision makers and describes the structural basis for corporate influence, namely infrastructural power, capital relations and corporate chains of command. To improve living conditions in Guinea, changes will need to be made throughout the commodity chain for aluminum, solutions that also involve consumer countries. }
}
@article{Costa2006366,
  title = {An interdisciplinary approach to integrating sustainability into mining engineering education and research },
  journal = {Journal of Cleaner Production },
  volume = {14},
  number = {3–4},
  pages = {366 - 373},
  year = {2006},
  note = {Improving Environmental, Economic and Ethical Performance in the Mining Industry. Part 1. Environmental Management and Sustainable Development },
  issn = {0959-6526},
  doi = {http://dx.doi.org/10.1016/j.jclepro.2004.01.011},
  url = {http://www.sciencedirect.com/science/article/pii/S095965260500048X},
  author = {Silvana Costa and Malcolm Scoble},
  keywords = {Sustainability},
  keywords = {Education},
  keywords = {Interdisciplinary},
  keywords = {Mining},
  keywords = {Research},
  keywords = {University},
  keywords = {Innovation },
  abstract = {This paper considers how the Mining Engineering Department at the University of British Columbia (UBC) is addressing the need to integrate sustainable development into mining engineering on behalf of industry and society. In particular, it considers the evolution of an interdisciplinary model, the Sustainability Working Group (SWG), which has brought together a diverse array of disciplines from academia, industry, government, \{NGOs\} and mining communities. The operational mechanisms and structure of \{SWG\} have enabled momentum to be gained through several initiatives. These aim to impact on undergraduate and postgraduate education, lifelong learning and research. }
}
@article{Chau200757,
  title = {Mining communities and their relationships in blogs: A study of online hate groups },
  journal = {International Journal of Human-Computer Studies },
  volume = {65},
  number = {1},
  pages = {57 - 70},
  year = {2007},
  note = {Information security in the knowledge economy },
  issn = {1071-5819},
  doi = {http://dx.doi.org/10.1016/j.ijhcs.2006.08.009},
  url = {http://www.sciencedirect.com/science/article/pii/S1071581906001248},
  author = {Michael Chau and Jennifer Xu},
  keywords = {Blogs},
  keywords = {Social network analysis},
  keywords = {Hate groups},
  keywords = {Web mining },
  abstract = {Blogs, often treated as the equivalent of online personal diaries, have become one of the fastest growing types of Web-based media. Everyone is free to express their opinions and emotions very easily through blogs. In the blogosphere, many communities have emerged, including hate groups and racists that are trying to share their ideology, express their views, or recruit new group members. It is important to analyze these virtual communities, defined based on membership and subscription linkages, in order to monitor for activities that are potentially harmful to society. While many Web mining and network analysis techniques have been used to analyze the content and structure of the Web sites of hate groups on the Internet, these techniques have not been applied to the study of hate groups in blogs. To address this issue, we have proposed a semi-automated approach in this research. The proposed approach consists of four modules, namely blog spider, information extraction, network analysis, and visualization. We applied this approach to identify and analyze a selected set of 28 anti-Blacks hate groups (820 bloggers) on Xanga, one of the most popular blog hosting sites. Our analysis results revealed some interesting demographical and topological characteristics in these groups, and identified at least two large communities on top of the smaller ones. The study also demonstrated the feasibility of applying the proposed approach in the study of hate groups and other related communities in blogs. }
}
@article{Kim2007398,
  title = {Personalized mining of web documents using link structures and fuzzy concept networks },
  journal = {Applied Soft Computing },
  volume = {7},
  number = {1},
  pages = {398 - 410},
  year = {2007},
  note = {},
  issn = {1568-4946},
  doi = {http://dx.doi.org/10.1016/j.asoc.2005.09.002},
  url = {http://www.sciencedirect.com/science/article/pii/S1568494605000815},
  author = {Kyung-Joong Kim and Sung-Bae Cho},
  keywords = {Web mining},
  keywords = {Fuzzy concept networks},
  keywords = {Personalization},
  keywords = {Link-based search},
  keywords = {Search engines },
  abstract = {Personalized search engines are important tools for finding web documents for specific users, because they are able to provide the location of information on the \{WWW\} as accurately as possible, using efficient methods of data mining and knowledge discovery. The types and features of traditional search engines are various, including support for different functionality and ranking methods. New search engines that use link structures have produced improved search results which can overcome the limitations of conventional text-based search engines. Going a step further, this paper presents a system that provides users with personalized results derived from a search engine that uses link structures. The fuzzy document retrieval system (constructed from a fuzzy concept network based on the user's profile) personalizes the results yielded from link-based search engines with the preferences of the specific user. A preliminary experiment with six subjects indicates that the developed system is capable of searching not only relevant but also personalized web pages, depending on the preferences of the user. }
}
@article{Montani201433,
  title = {Improving structural medical process comparison by exploiting domain knowledge and mined information },
  journal = {Artificial Intelligence in Medicine },
  volume = {62},
  number = {1},
  pages = {33 - 45},
  year = {2014},
  note = {},
  issn = {0933-3657},
  doi = {http://dx.doi.org/10.1016/j.artmed.2014.07.001},
  url = {http://www.sciencedirect.com/science/article/pii/S0933365714000815},
  author = {Stefania Montani and Giorgio Leonardi and Silvana Quaglini and Anna Cavallini and Giuseppe Micieli},
  keywords = {Process mining and comparison},
  keywords = {Graph edit distance},
  keywords = {Stroke management },
  abstract = {Objectives: Process model comparison and similar process retrieval is a key issue to be addressed in many real-world situations, and a particularly relevant one in medical applications, where similarity quantification can be exploited to accomplish goals such as conformance checking, local process adaptation analysis, and hospital ranking. In this paper, we present a framework that allows the user to: (i) mine the actual process model from a database of process execution traces available at a given hospital; and (ii) compare (mined) process models. The tool is currently being applied in stroke management. Methods: Our framework relies on process mining to extract process-related information (i.e., process models) from data. As for process comparison, we have modified a state-of-the-art structural similarity metric by exploiting: (i) domain knowledge; (ii) process mining outputs and statistical temporal information. These changes were meant to make the metric more suited to the medical domain. Results: Experimental results showed that our metric outperforms the original one, and generated output closer to that provided by a stroke management expert. In particular, our metric correctly rated 11 out of 15 mined hospital models with respect to a given query. On the other hand, the original metric correctly rated only 7 out of 15 models. The experiments also showed that the framework can support stroke management experts in answering key research questions: in particular, average patient improvement decreased as the distance (according to our metric) from the top level hospital process model increased. Conclusions: The paper shows that process mining and process comparison, through a similarity metric tailored to medical applications, can be applied successfully to clinical data to gain a better understanding of different medical processes adopted by different hospitals, and of their impact on clinical outcomes. In the future, we plan to make our metric even more general and efficient, by explicitly considering various methodological and technological extensions. We will also test the framework in different domains. }
}
@article{CourseaultTrumbach2006937,
  title = {Technology mining for small firms: Knowledge prospecting for competitive advantage },
  journal = {Technological Forecasting and Social Change },
  volume = {73},
  number = {8},
  pages = {937 - 949},
  year = {2006},
  note = {Tech Mining: Exploiting Science and Technology Information Resources },
  issn = {0040-1625},
  doi = {http://dx.doi.org/10.1016/j.techfore.2006.05.018},
  url = {http://www.sciencedirect.com/science/article/pii/S0040162506001144},
  author = {Cherie Courseault Trumbach and Dinah Payne and Alisa Kongthon},
  keywords = {Small business},
  keywords = {\{SME\}},
  keywords = {High tech},
  keywords = {Innovation},
  keywords = {Technology mining},
  keywords = {Information flow },
  abstract = {Successful small businesses effectively use outside information. High tech small firms are designed to be flexible and innovative. They are often built upon a successful innovative product. These organizations are challenged to remain innovative in a fast-paced competitive environment. However, they face constraints due to the costs associated with the development of continuous innovation. This paper presents technology mining as a method to aid small firms in remaining knowledgeable about innovative ideas. In doing so, the authors present findings from a small high tech company whose issues are typical of other small high tech firms. Areas for improvement and recommendations to address those areas are also presented for review and further reflection. }
}
@article{Ma20131,
  title = {Mining smart card data for transit riders’ travel patterns },
  journal = {Transportation Research Part C: Emerging Technologies },
  volume = {36},
  number = {0},
  pages = {1 - 12},
  year = {2013},
  note = {},
  issn = {0968-090X},
  doi = {http://dx.doi.org/10.1016/j.trc.2013.07.010},
  url = {http://www.sciencedirect.com/science/article/pii/S0968090X13001630},
  author = {Xiaolei Ma and Yao-Jan Wu and Yinhai Wang and Feng Chen and Jianfeng Liu},
  keywords = {Automatic Fare Collection System},
  keywords = {Smart card},
  keywords = {Transit travel pattern},
  keywords = {K-Means algorithm},
  keywords = {Rough set theory },
  abstract = {To mitigate the congestion caused by the ever increasing number of privately owned automobiles, public transit is highly promoted by transportation agencies worldwide. A better understanding of travel patterns and regularity at the “magnitude” level will enable transit authorities to evaluate the services they offer, adjust marketing strategies, retain loyal customers and improve overall transit performance. However, it is fairly challenging to identify travel patterns for individual transit riders in a large dataset. This paper proposes an efficient and effective data-mining procedure that models the travel patterns of transit riders in Beijing, China. Transit riders’ trip chains are identified based on the temporal and spatial characteristics of their smart card transaction data. The Density-based Spatial Clustering of Applications with Noise (DBSCAN) algorithm then analyzes the identified trip chains to detect transit riders’ historical travel patterns, and the K-Means++ clustering algorithm and rough-set theory are jointly applied to cluster and classify travel pattern regularities. The performance of the rough-set-based algorithm is compared with those of other prevailing classification algorithms. The results indicate that the proposed rough-set-based algorithm outperforms other commonly used data-mining algorithms in terms of accuracy and efficiency. }
}
@article{Grossman2006940,
  title = {Data mining middleware for wide-area high-performance networks },
  journal = {Future Generation Computer Systems },
  volume = {22},
  number = {8},
  pages = {940 - 948},
  year = {2006},
  note = {},
  issn = {0167-739X},
  doi = {http://dx.doi.org/10.1016/j.future.2006.03.024},
  url = {http://www.sciencedirect.com/science/article/pii/S0167739X06000392},
  author = {Robert L. Grossman and Yunhong Gu and David Hanley and Michal Sabala and Joe Mambretti and Alex Szalay and Ani Thakar and Kazumi Kumazoe and Oie Yuji and Minsun Lee and Yoonjoo Kwon and Woojin Seok},
  keywords = {High-performance network protocols},
  keywords = {High-performance networks},
  keywords = {High performance data mining},
  keywords = {Data mining middleware },
  abstract = {In this paper, we describe two distributed, data intensive applications that were demonstrated at iGrid 2005 (iGrid Demonstration \{US109\} and iGrid Demonstration US121). One involves transporting astronomical data from the Sloan Digital Sky Survey (SDSS) and the other involves computing histograms from multiple high-volume data streams. Both rely on newly developed data transport and data mining middleware. Specifically, we describe a new version of the \{UDT\} network protocol called Composable-UDT, a file transfer utility based upon \{UDT\} called UDT-Gateway, and an application for building histograms on high-volume data flows called \{BESH\} (for Best Effort Streaming Histogram). For both demonstrations, we include a summary of the experimental studies performed at iGrid 2005. }
}
@article{Facca2005225,
  title = {Mining interesting knowledge from weblogs: a survey },
  journal = {Data & Knowledge Engineering },
  volume = {53},
  number = {3},
  pages = {225 - 241},
  year = {2005},
  note = {},
  issn = {0169-023X},
  doi = {http://dx.doi.org/10.1016/j.datak.2004.08.001},
  url = {http://www.sciencedirect.com/science/article/pii/S0169023X04001387},
  author = {Federico Michele Facca and Pier Luca Lanzi},
  keywords = {Machine learning},
  keywords = {Web Mining },
  abstract = {Web Usage Mining is that area of Web Mining which deals with the extraction of interesting knowledge from logging information produced by Web servers. In this paper we present a survey of the recent developments in this area that is receiving increasing attention from the Data Mining community. }
}
@article{Li2006835,
  title = {Inspection-oriented coding service based on machine learning and semantics mining },
  journal = {Expert Systems with Applications },
  volume = {31},
  number = {4},
  pages = {835 - 848},
  year = {2006},
  note = {Computer Supported Cooperative Work in Design and Manufacturing },
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2006.01.019},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417406000194},
  author = {Yinsheng Li and Zhanxin Ma and Wei Xie and Chris Laing},
  keywords = {\{HS\} codes},
  keywords = {Semantics mining},
  keywords = {Knowledge database},
  keywords = {Web services },
  abstract = {\{HS\} codes have been adopted by the majority of countries as the basis for import and export inspection and the generation of trade statistics. Customs authorities and international traders need a \{HS\} code query tool to make their processing efficient and automatic. Since \{HS\} codes are identified at 5–7 levels of classification, any intelligent coding service will need to combine a knowledge database with the techniques of data mining, machine learning and semantics reasoning. In this paper, the authors propose a comprehensive solution for such a coding service. The architecture, related techniques, technical solution and implementation considerations for the proposed system have been provided. Several of the proposed functions and implementation techniques have been developed and deployed by the Shanghai International Airport Entry-Exit Inspection and Quarantine Bureau. The coding service has been published as a Web service, and has the potential to be widely used by authorities and international traders around the world. The proposed system may also be appropriate for other applications that relate to code or classification processes, such as RFID-based or product-ontology-based applications. }
}
@incollection{Black2006621,
  title = {Text Mining },
  editor = {Brown, Keith },
  booktitle = {Encyclopedia of Language & Linguistics (Second Edition) },
  publisher = {Elsevier},
  edition = {Second Edition},
  address = {Oxford},
  year = {2006},
  pages = {621 - 624},
  isbn = {978-0-08-044854-1},
  doi = {http://dx.doi.org/10.1016/B0-08-044854-2/00965-2},
  url = {http://www.sciencedirect.com/science/article/pii/B0080448542009652},
  author = {W. Black},
  keywords = {clustering},
  keywords = {data mining},
  keywords = {document warehousing},
  keywords = {information retrieval},
  keywords = {knowledge discovery},
  keywords = {natural language processing applications},
  keywords = {text mining },
  abstract = {Text mining is an application of natural language processing (NLP) to large document collections. The purpose is to ‘discover’ practical and useful knowledge from the text collection in business or scientific domains. The documents are generally managed in a document warehouse, and a variety of robust and efficient \{NLP\} and statistical techniques may be used to reveal patterns that are only apparent when multiple documents are processed. Among the achievements of text mining are new medical discoveries made by collating complementary scientific literatures. }
}
@article{ErteschikShir2013145,
  title = {Missing objects as Topic Drop },
  journal = {Lingua },
  volume = {136},
  number = {0},
  pages = {145 - 169},
  year = {2013},
  note = {SI: Information Structure Triggers Effects on (De)Accentuation, Dislocation and Discourse Linking },
  issn = {0024-3841},
  doi = {http://dx.doi.org/10.1016/j.lingua.2013.07.009},
  url = {http://www.sciencedirect.com/science/article/pii/S0024384113001654},
  author = {Nomi Erteschik-Shir and Lena Ibnbari and Sharon Taube},
  keywords = {Topic Drop},
  keywords = {\{VP\} ellipsis},
  keywords = {Parasitic Gaps},
  keywords = {Russian},
  keywords = {Hebrew},
  keywords = {Differential object marking},
  keywords = {Strict and sloppy readings},
  keywords = {Subject–object asymmetry },
  abstract = {In this paper we argue that missing objects are to be analyzed in Information Structure terms as missing ‘continued’ topics, referred to here as Topic Drop. We employ two cases to argue for this claim. First we show that missing objects in Russian adverbial gerunds are to be analyzed as Topic Drop rather than as Parasitic Gaps. Similarly, Topic Drop is solely responsible for the missing object in Hebrew structures which have also received structural analyses involving VP-ellipsis or a moved null operator. We argue against movement analyses of both constructions and propose that missing objects are to be analyzed as unvalued feature bundles that receive an interpretation from an available topic in the discourse. }
}
@article{Shapira2006950,
  title = {Measures for knowledge-based economic development: Introducing data mining techniques to economic developers in the state of Georgia and the \{US\} South },
  journal = {Technological Forecasting and Social Change },
  volume = {73},
  number = {8},
  pages = {950 - 965},
  year = {2006},
  note = {Tech Mining: Exploiting Science and Technology Information Resources },
  issn = {0040-1625},
  doi = {http://dx.doi.org/10.1016/j.techfore.2006.05.017},
  url = {http://www.sciencedirect.com/science/article/pii/S0040162506001132},
  author = {Philip Shapira and Jan Youtie},
  keywords = {Economic development},
  keywords = {Innovation},
  keywords = {Knowledge measurement},
  keywords = {Data mining},
  keywords = {Bibliometrics },
  abstract = {The contribution of knowledge to economic growth and competitiveness has attracted increased attention. Publications with a topical focus on areas related to innovation have risen dramatically from 1963 to 2005, but more slowly in local and regional development journals. In contrast to the wide use of aggregate measures of innovation, this paper presents four cases that introduce disaggregated knowledge-based approaches into the policy- and decision-making processes of economic developers in the state of Georgia and the \{US\} South. The first case uses information obtained from patents and publications to inform traditional out-of-area economic development recruitment strategies in a more knowledge-oriented direction. The second case exemplifies the use of data mining to identify top researchers as part of a strategic state economic development effort. The third case illustrates how local knowledge-based capabilities can be identified in cities not traditionally viewed as innovative. Nanotechnology-related knowledge assets in the southern United States are mapped and assessed in the fourth case. Disaggregated methods used in traditional strategies were most intuitively understood and used, but new knowledge measures were found to encourage local and state economic developers to begin to embrace new paradigms. }
}
@article{Huang201439,
  title = {Discovery of clinical pathway patterns from event logs using probabilistic topic models },
  journal = {Journal of Biomedical Informatics },
  volume = {47},
  number = {0},
  pages = {39 - 57},
  year = {2014},
  note = {},
  issn = {1532-0464},
  doi = {http://dx.doi.org/10.1016/j.jbi.2013.09.003},
  url = {http://www.sciencedirect.com/science/article/pii/S1532046413001445},
  author = {Zhengxing Huang and Wei Dong and Lei Ji and Chenxi Gan and Xudong Lu and Huilong Duan},
  keywords = {Clinical pathway analysis},
  keywords = {Topic models},
  keywords = {Latent Dirichlet Allocation},
  keywords = {Pattern discovery},
  keywords = {Clinical event log },
  abstract = {Discovery of clinical pathway (CP) patterns has received increasing attention over the years due to its importance for revealing the structure, semantics and dynamics of CPs, and to its usefulness for providing clinicians with explicit knowledge which can be directly used to guide treatment activities of individual patients. Generally, discovery of \{CP\} patterns is a challenging task, as treatment behaviors in \{CPs\} often have a large variability depending on factors such as time, location and the individual patient. Based on the assumption that \{CP\} patterns can be derived from clinical event logs which usually record various treatment activities in \{CP\} executions, this study proposes a novel approach to \{CP\} pattern discovery by modeling \{CPs\} using mixtures of an extension to the Latent Dirichlet Allocation family that jointly models various treatment activities and their occurring time stamps in CPs. Clinical case studies are performed to evaluate the proposed approach via real-world data sets recording typical treatment behaviors in patient careflow. The obtained results demonstrate the suitability of the proposed approach for \{CP\} pattern discovery, and indicate its promise for research efforts related to \{CP\} analysis and optimization. }
}
@article{Fu2015369,
  title = {Automated classification of software change messages by semi-supervised Latent Dirichlet Allocation },
  journal = {Information and Software Technology },
  volume = {57},
  number = {0},
  pages = {369 - 377},
  year = {2015},
  note = {},
  issn = {0950-5849},
  doi = {http://dx.doi.org/10.1016/j.infsof.2014.05.017},
  url = {http://www.sciencedirect.com/science/article/pii/S0950584914001347},
  author = {Ying Fu and Meng Yan and Xiaohong Zhang and Ling Xu and Dan Yang and Jeffrey D. Kymer},
  keywords = {Software repositories mining},
  keywords = {Semi-supervised topic modeling},
  keywords = {\{LDA\}},
  keywords = {Change message },
  abstract = {Context: Topic models such as probabilistic Latent Semantic Analysis (pLSA) and Latent Dirichlet Allocation (LDA) have demonstrated success in mining software repository tasks. Understanding software change messages described by unstructured natural-language text is one of the fundamental challenges in mining these messages in repositories. Objective: We seek to present a novel automatic change message classification method characterized by semi-supervised topic semantic analysis. Method: In this work, we present a semi-supervised \{LDA\} based approach to automatically classify change messages. We use domain knowledge of software changes to create labeled samples which are added to build the semi-supervised \{LDA\} model. Next, we verify the cross-project analysis application of our method on three open-source projects. Our method has two advantages over existing software change classification methods: First of all, it mitigates the issue of how to set the appropriate number of latent topics. We do not have to choose the number of latent topics in our method, because it corresponds to the number of class labels. Second, this approach utilizes the information provided by the labeled samples in the training set. Results: Our method automatically classified about 85% of the change messages in our experiment and our validation survey showed that 70.56% of the time our automatic classification results were in agreement with developer opinions. Conclusion: Our approach automatically classifies most of the change messages which record the cause of the software change and the method is applicable to cross-project analysis of software change messages. }
}
@article{SanJuan20061532,
  title = {Text mining without document context },
  journal = {Information Processing & Management },
  volume = {42},
  number = {6},
  pages = {1532 - 1552},
  year = {2006},
  note = {Special Issue on Informetrics },
  issn = {0306-4573},
  doi = {http://dx.doi.org/10.1016/j.ipm.2006.03.017},
  url = {http://www.sciencedirect.com/science/article/pii/S0306457306000380},
  author = {Eric SanJuan and Fidelia Ibekwe-SanJuan},
  keywords = {Multi-word term clustering},
  keywords = {Lexico-syntactic relations},
  keywords = {Text mining},
  keywords = {Informetrics},
  keywords = {Cluster evaluation },
  abstract = {We consider a challenging clustering task: the clustering of multi-word terms without document co-occurrence information in order to form coherent groups of topics. For this task, we developed a methodology taking as input multi-word terms and lexico-syntactic relations between them. Our clustering algorithm, named \{CPCL\}, is implemented in the TermWatch system. We compared \{CPCL\} to other existing clustering algorithms, namely hierarchical and partitioning (k-means, k-medoids). This out-of-context clustering task led us to adapt multi-word term representation for statistical methods and also to refine an existing cluster evaluation metric, the editing distance, in order to evaluate the methods. Evaluation was carried out on a list of multi-word terms from the genomic field which comes with a hand-built taxonomy. Results showed that while k-means and k-medoids obtained good scores on the editing distance, they were very sensitive to term length. \{CPCL\}, on the other hand, obtained a better cluster homogeneity score and was less sensitive to term length. Also, \{CPCL\} showed good adaptability for handling very large and sparse matrices. }
}
@article{Dawyndt2006410,
  title = {Mining fatty acid databases for detection of novel compounds in aerobic bacteria },
  journal = {Journal of Microbiological Methods },
  volume = {66},
  number = {3},
  pages = {410 - 433},
  year = {2006},
  note = {},
  issn = {0167-7012},
  doi = {http://dx.doi.org/10.1016/j.mimet.2006.01.008},
  url = {http://www.sciencedirect.com/science/article/pii/S0167701206000236},
  author = {Peter Dawyndt and Marc Vancanneyt and Cindy Snauwaert and Bernard De Baets and Hans De Meyer and Jean Swings},
  keywords = {Bacterial identification},
  keywords = {Data mining},
  keywords = {Fatty acid analysis},
  keywords = {Feature extraction},
  keywords = {Knowledge discovery in databases },
  abstract = {This study examines how the discriminatory power of an automated bacterial whole-cell fatty acid identification system can be significantly enhanced by exploring the vast amounts of information accumulated during 15 years of routine gas chromatographic analysis of the fatty acid content of aerobic bacteria. Construction of a global peak occurrence histogram based upon a large fatty acid database is shown to serve as a highly informative tool for assessing the delineation of the naming windows used during the automatic recognition of fatty acid compounds. Along the lines of this data mining application, it is suggested that several naming windows of the Sherlock \{MIS\} \{TSBA50\} peak naming method may need to be re-evaluated in order to fit more closely with the bulk of observed fatty acid profiles. At the same time, the global peak occurrence histogram has put forward the delineation of 32 new peak naming windows, accounting for a 26% increase in the total number of fatty acid features taken into account for bacterial identification. By scrutinizing the relationships between the newly delineated naming windows and the many taxonomic units covered within a proprietary fatty acid database, all new naming windows were proven to correspond with stable features of some specific groups of microorganisms. This latter analysis clearly underscores the impact of incorporating the new fatty acid compounds for improving the resolution of the bacterial identification system and endorses the applicability of knowledge discovery in databases within the field of microbiology. }
}
@article{Krause2013541,
  title = {Reality mining of animal social systems },
  journal = {Trends in Ecology & Evolution },
  volume = {28},
  number = {9},
  pages = {541 - 551},
  year = {2013},
  note = {},
  issn = {0169-5347},
  doi = {http://dx.doi.org/10.1016/j.tree.2013.06.002},
  url = {http://www.sciencedirect.com/science/article/pii/S0169534713001468},
  author = {Jens Krause and Stefan Krause and Robert Arlinghaus and Ioannis Psorakis and Stephen Roberts and Christian Rutz},
  keywords = {animal social network},
  keywords = {biologging},
  keywords = {biotelemetry},
  keywords = {group-living},
  keywords = {high-resolution data},
  keywords = {movement},
  keywords = {proximity logging},
  keywords = {social behaviour },
  abstract = {The increasing miniaturisation of animal-tracking technology has made it possible to gather exceptionally detailed machine-sensed data on the social dynamics of almost entire populations of individuals, in both terrestrial and aquatic study systems. Here, we review important issues concerning the collection of such data, and their processing and analysis, to identify the most promising approaches in the emerging field of ‘reality mining’. Automated technologies can provide data sensing at time intervals small enough to close the gap between social patterns and their underlying processes, providing insights into how social structures arise and change dynamically over different timescales. Especially in conjunction with experimental manipulations, reality mining promises significant advances in basic and applied research on animal social systems. }
}
@article{Tangherlini2013725,
  title = {Trawling in the Sea of the Great Unread: Sub-corpus topic modeling and Humanities research },
  journal = {Poetics },
  volume = {41},
  number = {6},
  pages = {725 - 749},
  year = {2013},
  note = {Topic Models and the Cultural Sciences },
  issn = {0304-422X},
  doi = {http://dx.doi.org/10.1016/j.poetic.2013.08.002},
  url = {http://www.sciencedirect.com/science/article/pii/S0304422X13000648},
  author = {Timothy R. Tangherlini and Peter Leonard},
  keywords = {Topic modeling},
  keywords = {Literature},
  keywords = {Big data},
  keywords = {Search},
  keywords = {19th century },
  abstract = {Given a small, well-understood corpus that is of interest to a Humanities scholar, we propose sub-corpus topic modeling (STM) as a tool for discovering meaningful passages in a larger collection of less well-understood texts. \{STM\} allows Humanities scholars to discover unknown passages from the vast sea of works that Moretti calls the “great unread” and to significantly increase the researcher's ability to discuss aspects of influence and the development of intellectual movements across a broader swath of the literary landscape. In this article, we test three typical Humanities research problems: in the first, a researcher wants to find text passages that exhibit similarities to a collection of influential non-literary texts from a single author (here, Darwin); in the second, a researcher wants to discover literary passages related to a well-understood corpus of literary texts (here, emblematic texts from the Modern Breakthrough); and in the third, a researcher hopes to understand the influence that a particular domain (here, folklore) has had on the realm of literature over a series of decades. We explore these research challenges with three experiments. }
}
@article{Lifna201586,
  title = {Identifying Concept-drift in Twitter Streams },
  journal = {Procedia Computer Science },
  volume = {45},
  number = {0},
  pages = {86 - 94},
  year = {2015},
  note = {International Conference on Advanced Computing Technologies and Applications (ICACTA) },
  issn = {1877-0509},
  doi = {http://dx.doi.org/10.1016/j.procs.2015.03.093},
  url = {http://www.sciencedirect.com/science/article/pii/S1877050915003294},
  author = {C.S. Lifna and M. Vijayalakshmi},
  keywords = {Big Data},
  keywords = {Data Stream Mining},
  keywords = {Sliding Window},
  keywords = {Landmark Window},
  keywords = {Twitter},
  keywords = {Classification},
  keywords = {Topic ranking},
  keywords = {Concept-Drift },
  abstract = {We live in a Big Data society, where data is exchanged like currency. What we produce as data affords us access to different applications, benefits, services, deliveries, etc. In today's world, communication is mainly through social networking sites like Twitter, Facebook, and Google+. The huge amount of data being generated and shared across these micro-blogging sites serves as a good source of Big Data Streams for analysis. As the topic of discussion changes drastically, the relevance of data is temporal, which leads to concept-drift. Identifying and handling this concept-drift in such Big Data Streams is a present area of interest. The state-of-the-art techniques for identifying trending topics in such data streams mainly concentrate on the frequency of a topic as the key parameter. Concentrating on such a weak indicator reduces the precision of mining. This study puts forward a novel approach towards identifying concept-drift by initially grouping topics into classes and assigning a weightage to each class, using a sliding window processing model on Twitter streams. }
}
@article{Saitta2005289,
  title = {Data mining techniques for improving the reliability of system identification },
  journal = {Advanced Engineering Informatics },
  volume = {19},
  number = {4},
  pages = {289 - 298},
  year = {2005},
  note = {Computing in Civil Engineering 11th International Workshop of the European Group for Intelligent Computing in Engineering },
  issn = {1474-0346},
  doi = {http://dx.doi.org/10.1016/j.aei.2005.07.005},
  url = {http://www.sciencedirect.com/science/article/pii/S1474034605000625},
  author = {S. Saitta and B. Raphael and I.F.C. Smith},
  keywords = {Model-based diagnosis},
  keywords = {System identification},
  keywords = {Data mining },
  abstract = {A system identification methodology that makes use of data mining techniques to improve the reliability of identification is presented in this paper. An important aspect of the methodology is the generation of a population of candidate models. Indications of the reliability of system identification are obtained through an examination of the characteristics of the population. Data mining techniques bring out model characteristics that are important. The methodology has been applied to several engineering systems. }
}
@article{Veiga20137132,
  title = {The \{ECHORD\} project proposals analysis – Research profiles, collaboration patterns and research topic trends },
  journal = {Expert Systems with Applications },
  volume = {40},
  number = {17},
  pages = {7132 - 7140},
  year = {2013},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2013.06.046},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417413004417},
  author = {Germano Veiga and Cristovão Silva and Ricardo Araújo and Norberto Pires and Bruno Siciliano},
  keywords = {Pattern collaboration analysis},
  keywords = {Research topic analysis},
  keywords = {European robotics},
  keywords = {Social network analysis },
  abstract = {This paper investigates the research profiles, collaboration patterns and research topic trends that can be identified in the proposals submitted to the \{ECHORD\} (European Clearing House for Open Robotics Development) \{FP7\} project. On a country level, clusters were identified and characterized by patterns of proposal production per inhabitant, score and international cooperation. Belgium and Sweden constitute a cluster characterized by high proposal production, with very high scores and extensive international collaboration. Belgium also stands out in another cluster analysis as the only country where 100% of proposals involve industry–academia cooperation and obtain scores above 10. Other findings show that single-partner proposals have significantly lower quality than multi-partner proposals but, on the other hand, the number of countries involved shows no influence on the quality of the proposals. Despite the high number of industrial participants present in the proposals, it is observed that they play secondary roles, with a very low number of projects led by companies. It is also observed that partnerships between research institutions (non-universities) are the most successful. Concerning the topics of the proposals, the technology human–robot interface and the product vision robot for small-scale manufacturing are the most significant. Finally, the paper shows clusters of institutions extracted from the giant network of relations obtained from the \{ECHORD\} set of proposals. }
}
@article{Lee200679,
  title = {Mining association rules with multi-dimensional constraints },
  journal = {Journal of Systems and Software },
  volume = {79},
  number = {1},
  pages = {79 - 92},
  year = {2006},
  note = {},
  issn = {0164-1212},
  doi = {http://dx.doi.org/10.1016/j.jss.2005.03.005},
  url = {http://www.sciencedirect.com/science/article/pii/S016412120500018X},
  author = {Anthony J.T. Lee and Wan-chuen Lin and Chun-sheng Wang},
  keywords = {Data mining},
  keywords = {Association rule},
  keywords = {Constraint-based mining },
  abstract = {To improve the effectiveness and efficiency of mining tasks, constraint-based mining enables users to concentrate on mining the association rules they are interested in instead of the complete set of association rules. Previously proposed methods mainly handle a single constraint and only consider constraints characterized by a single attribute value. In this paper, we propose an approach to mine association rules with multiple constraints constructed from multi-dimensional attribute values. Our proposed approach consists of three phases. First, we collect the frequent items and prune infrequent items according to the Apriori property. Second, we exploit the properties of the given constraints to prune the search space or save constraint checking in the conditional databases. Third, for each itemset that can possibly satisfy the constraint, we generate its conditional database and perform the three phases in the conditional database recursively. Because our proposed algorithm can exploit the properties of constraints to prune the search space or save constraint checking, it is more efficient than the revised FP-growth and \{FIC\} algorithms. }
}
@article{Tseng20071216,
  title = {Text mining techniques for patent analysis },
  journal = {Information Processing & Management },
  volume = {43},
  number = {5},
  pages = {1216 - 1247},
  year = {2007},
  note = {Patent Processing },
  issn = {0306-4573},
  doi = {http://dx.doi.org/10.1016/j.ipm.2006.11.011},
  url = {http://www.sciencedirect.com/science/article/pii/S0306457306002020},
  author = {Yuen-Hsien Tseng and Chi-Jen Lin and Yu-I Lin},
  keywords = {Summarization},
  keywords = {Phrase extraction},
  keywords = {Co-word analysis},
  keywords = {Clustering},
  keywords = {Topic mapping },
  abstract = {Patent documents contain important research results. However, they are lengthy and rich in technical terminology, so analyzing them takes a lot of human effort. Automatic tools for assisting patent engineers or decision makers in patent analysis are in great demand. This paper describes a series of text mining techniques that conforms to the analytical process used by patent analysts. These techniques include text segmentation, summary extraction, feature selection, term association, cluster generation, topic identification, and information mapping. The issues of efficiency and effectiveness are considered in the design of these techniques. Some important features of the proposed methodology include a rigorous approach to verify the usefulness of segment extracts as the document surrogates, a corpus- and dictionary-free algorithm for keyphrase extraction, an efficient co-word analysis method that can be applied to a large volume of patents, and an automatic procedure to create generic cluster titles for ease of result interpretation. Evaluation of these techniques was conducted. The results confirm that the machine-generated summaries do preserve more important content words than some other sections for classification. To demonstrate the feasibility, the proposed methodology was applied to a real-world patent set for domain analysis and mapping, which shows that our approach is more effective than existing classification systems. The attempt in this paper to automate the whole process not only helps create final patent maps for topic analyses, but also facilitates or improves other patent analysis tasks such as patent classification, organization, knowledge sharing, and prior art searches. }
}
@article{Kim2013230,
  title = {Topic-Driven SocialRank: Personalized search result ranking by identifying similar, credible users in a social network },
  journal = {Knowledge-Based Systems },
  volume = {54},
  number = {0},
  pages = {230 - 242},
  year = {2013},
  note = {},
  issn = {0950-7051},
  doi = {http://dx.doi.org/10.1016/j.knosys.2013.09.011},
  url = {http://www.sciencedirect.com/science/article/pii/S095070511300289X},
  author = {Young An Kim and Gun Woo Park},
  keywords = {Topic-Driven SocialRank},
  keywords = {Similarity},
  keywords = {Credibility},
  keywords = {Social search},
  keywords = {\{SN\} },
  abstract = {A Social Network Service (SNS) is a popular type of lifestyle Web service that connects a user with friends, and a user’s interest in Web search can affect her friends who have similar interests. If these users’ preferences can be tracked, we can show more relevant information following a user’s interests. In this paper, we propose the Topic-Driven SocialRank algorithm to show interest-driven search results with relevant Web content from friends using social contacts online by identifying similar, credible users. Our assumption is that credible users issue more relevant information. We observe that a user has certain common interests with her similar friends in the SN, and focus on identifying similar users who have high credibility and sharing their search experiences. Experimental validation shows that our method significantly outperforms the baseline method. Our method is potentially effective in finding more relevant search results with the implicit help of familiar, credible users. }
}
@article{Mothe2006460,
  title = {Combining mining and visualization tools to discover the geographic structure of a domain },
  journal = {Computers, Environment and Urban Systems },
  volume = {30},
  number = {4},
  pages = {460 - 484},
  year = {2006},
  note = {Geographic Information Retrieval (GIR) },
  issn = {0198-9715},
  doi = {http://dx.doi.org/10.1016/j.compenvurbsys.2005.09.004},
  url = {http://www.sciencedirect.com/science/article/pii/S0198971505000827},
  author = {Josiane Mothe and Claude Chrisment and Taoufiq Dkaki and Bernard Dousset and Saïd Karouach},
  keywords = {Information mining},
  keywords = {Data analysis},
  keywords = {Domain knowledge},
  keywords = {Knowledge discovery},
  keywords = {Information visualization},
  keywords = {Geographical maps },
  abstract = {Science monitoring is a core issue in the new world of business and research. Companies and institutes need to monitor the activities of their competitors, get information on the market, changing technologies or government policies. This paper presents the Tétralogie platform that is aimed at allowing a user to interactively discover trends in scientific research and communities from large textual collections that include information about geographical location. Tétralogie consists of several agents that communicate with each other on users’ demands in order to deliver results to them. Metadata and document content are extracted before being mined. Results are displayed in the form of histograms, networks and geographical maps; these complementary types of presentations increase the possibilities of analysis compared to the use of these tools separately. We illustrate the overall process through a case study of scientific literature analysis and show how the different agents can be combined to discover the structure of a domain. The system correctly predicts the country contribution to a field in future years and allows exploration of the relationships between countries. }
}
@article{Damigos2006234,
  title = {An overview of environmental valuation methods for the mining industry },
  journal = {Journal of Cleaner Production },
  volume = {14},
  number = {3–4},
  pages = {234 - 247},
  year = {2006},
  note = {Improving Environmental, Economic and Ethical Performance in the Mining Industry. Part 1. Environmental Management and Sustainable Development },
  issn = {0959-6526},
  doi = {http://dx.doi.org/10.1016/j.jclepro.2004.06.005},
  url = {http://www.sciencedirect.com/science/article/pii/S095965260500034X},
  author = {D. Damigos},
  keywords = {Mining},
  keywords = {Environmental valuation},
  keywords = {Cost–benefit appraisal},
  keywords = {Natural resource damage },
  abstract = {During recent decades, public concern about the protection, preservation and restoration of the environment has increased. In response, legislative action on environmental issues that affect the mining industry has been taken worldwide. Within this new framework, two issues are of critical importance to the sector – namely, project appraisal by means of social cost–benefit analysis (SCBA), and natural resource damage assessments. In the vast majority of cases, both issues require an analysis in which the loss of natural resources and the effects of environmental degradation on human health and well-being are assessed in monetary terms. This paper focuses on the application of environmental valuation methods in mining. Principal legislative requirements are briefly described, some critical points of the most important valuation methods are outlined, and demonstrative examples are presented. }
}
@article{Dustdar2005129,
  title = {Mining of ad-hoc business processes with TeamLog },
  journal = {Data & Knowledge Engineering },
  volume = {55},
  number = {2},
  pages = {129 - 158},
  year = {2005},
  note = {},
  issn = {0169-023X},
  doi = {http://dx.doi.org/10.1016/j.datak.2005.02.002},
  url = {http://www.sciencedirect.com/science/article/pii/S0169023X05000133},
  author = {Schahram Dustdar and Thomas Hoffmann and Wil van der Aalst},
  keywords = {Process mining},
  keywords = {Workflow management},
  keywords = {Business process management},
  keywords = {Business process analysis},
  keywords = {TeamLog},
  keywords = {Caramba},
  keywords = {\{EMiT\}},
  keywords = {MinSoN },
  abstract = {The design of workflows is a complicated task. In those cases where the control flow between activities cannot be modeled in advance but simply occurs at enactment time (run time), we speak of ad-hoc processes. Ad-hoc processes allow for the flexibility needed in real-life business processes. Since ad-hoc processes are highly dynamic, they represent one of the most difficult challenges, both technically and conceptually. Caramba is one of the few process-aware collaboration systems allowing for ad-hoc processes. Unlike in classical workflow systems, the users are no longer restricted by the system. Therefore, it is interesting to study the actual way people and organizations work. In this paper, we propose process mining techniques and tools to analyze ad-hoc processes. We introduce process mining, discuss the concept of mining in the context of ad-hoc processes, and demonstrate a concrete application of the concept using Caramba, process mining tools such as \{EMiT\} and MinSoN, and a newly developed extraction tool named TeamLog. }
}
@article{Grant20131663,
  title = {Using heuristics to estimate an appropriate number of latent topics in source code analysis },
  journal = {Science of Computer Programming },
  volume = {78},
  number = {9},
  pages = {1663 - 1678},
  year = {2013},
  note = {},
  issn = {0167-6423},
  doi = {http://dx.doi.org/10.1016/j.scico.2013.03.015},
  url = {http://www.sciencedirect.com/science/article/pii/S0167642313000762},
  author = {Scott Grant and James R. Cordy and David B. Skillicorn},
  keywords = {Source code analysis},
  keywords = {Latent Dirichlet Allocation},
  keywords = {Latent topic model},
  keywords = {Code clusters },
  abstract = {Latent Dirichlet Allocation (LDA) is a data clustering algorithm that performs especially well for text documents. In natural-language applications it automatically finds groups of related words (called “latent topics”) and clusters the documents into sets that are about the same “topic”. \{LDA\} has also been applied to source code, where the documents are natural source code units such as methods or classes, and the words are the keywords, operators, and programmer-defined names in the code. The problem of determining a topic count that most appropriately describes a set of source code documents is an open problem. We address this empirically by constructing clusterings with different numbers of topics for a large number of software systems, and then use a pair of measures based on source code locality and topic model similarity to assess how well the topic structure identifies related source code units. Results suggest that the topic count required can be closely approximated using the number of software code fragments in the system. We extend these results to recommend appropriate topic counts for arbitrary software systems based on an analysis of a set of open source systems. }
}
@article{Hüllermeier2005387,
  title = {Fuzzy methods in machine learning and data mining: Status and prospects },
  journal = {Fuzzy Sets and Systems },
  volume = {156},
  number = {3},
  pages = {387 - 406},
  year = {2005},
  note = {40th Anniversary of Fuzzy Sets },
  issn = {0165-0114},
  doi = {http://dx.doi.org/10.1016/j.fss.2005.05.036},
  url = {http://www.sciencedirect.com/science/article/pii/S0165011405002861},
  author = {Eyke Hüllermeier},
  keywords = {Machine learning},
  keywords = {Data mining},
  keywords = {Knowledge discovery },
  abstract = {Over the past years, methods for the automated induction of models and the extraction of interesting patterns from empirical data have attracted considerable attention in the fuzzy set community. This paper briefly reviews some typical applications and highlights potential contributions that fuzzy set theory can make to machine learning, data mining, and related fields. The paper concludes with a critical consideration of recent developments and some suggestions for future research directions. }
}
@incollection{DiNunzio201435,
  title = {Chapter 2 - Picturing Bayesian Classifiers: A Visual Data Mining Approach to Parameters Optimization },
  editor = {Zhao, Yanchang and Cen, Yonghua},
  booktitle = {Data Mining Applications with R },
  publisher = {Academic Press},
  edition = {},
  address = {Boston},
  year = {2014},
  pages = {35 - 61},
  isbn = {978-0-12-411511-8},
  doi = {http://dx.doi.org/10.1016/B978-0-12-411511-8.00002-5},
  url = {http://www.sciencedirect.com/science/article/pii/B9780124115118000025},
  author = {Giorgio Maria Di Nunzio and Alessandro Sordoni},
  keywords = {Visualization System},
  keywords = {Naïve Bayes},
  keywords = {Smoothing},
  keywords = {Text classification },
  abstract = {The goal of supervised classification is to assign a new object to a class from a given set of classes based on the attribute values of this object and on a training set. Although “supervised,” classification algorithms permit only very limited forms of guidance by the user. Typically, the user selects the dataset and sets the values for some parameters of the algorithm, which are often difficult to determine a priori. We believe the user should be involved more interactively in the process of classification because, by providing adequate data and knowledge visualizations, the pattern recognition capabilities of the human can be used to increase the effectiveness of classifier construction. Moreover, users often want to validate and explore the classifier model and its output. To address these issues, the classification system should have an intuitive and interactive explanation capability. We present a two-dimensional visualization tool for Bayesian classifiers that can help the user understand why a classifier makes the predictions it does given the vector of parameters in input. The user can interact with the classifier by selecting different models and changing the parameters of the prior. To help people discover (sub)optimal parameters, we develop a visual interaction method that allows objects to be interactively analyzed. Finally, we present a case study to demonstrate the effectiveness of our solution in text classification. }
}
@article{Khan2014245,
  title = {TOM: Twitter opinion mining framework using hybrid classification scheme },
  journal = {Decision Support Systems },
  volume = {57},
  number = {0},
  pages = {245 - 257},
  year = {2014},
  note = {},
  issn = {0167-9236},
  doi = {http://dx.doi.org/10.1016/j.dss.2013.09.004},
  url = {http://www.sciencedirect.com/science/article/pii/S0167923613002327},
  author = {Farhan Hassan Khan and Saba Bashir and Usman Qamar},
  keywords = {Twitter},
  keywords = {Sentiment analysis},
  keywords = {Classification},
  keywords = {SentiWordNet},
  keywords = {Social network analysis},
  keywords = {Data sparsity },
  abstract = {Twitter has recently become one of the most popular micro-blogging platforms. Millions of users can share their thoughts and opinions about different aspects and events on the platform. Therefore, Twitter is considered a rich source of information for decision making and sentiment analysis. Sentiment analysis refers to a classification problem where the main focus is to predict the polarity of words and then classify them into positive and negative feelings, with the aim of identifying attitudes and opinions that are expressed in any form or language. Sentiment analysis over Twitter offers organisations a fast and effective way to monitor the public's feelings towards their brand, business, directors, etc. A wide range of features and methods for training sentiment classifiers for Twitter datasets have been researched in recent years with varying results. The primary issues in previous techniques are classification accuracy, data sparsity and sarcasm, as they misclassify most of the tweets, with a very high percentage incorrectly classified as neutral. This research paper focuses on these problems and presents an algorithm for Twitter feed classification based on a hybrid approach. The proposed method includes various pre-processing steps before feeding the text to the classifier. Experimental results show that the proposed technique overcomes the previous limitations and achieves higher accuracy when compared to similar techniques. }
}
@article{Vavliakis20131,
  title = {Event identification in web social media through named entity recognition and topic modeling },
  journal = {Data & Knowledge Engineering },
  volume = {88},
  number = {0},
  pages = {1 - 24},
  year = {2013},
  note = {},
  issn = {0169-023X},
  doi = {http://dx.doi.org/10.1016/j.datak.2013.08.006},
  url = {http://www.sciencedirect.com/science/article/pii/S0169023X13000827},
  author = {Konstantinos N. Vavliakis and Andreas L. Symeonidis and Pericles A. Mitkas},
  keywords = {Event identification},
  keywords = {Social media analysis},
  keywords = {Topic maps},
  keywords = {Peak detection},
  keywords = {Topic clustering },
  abstract = {The problem of identifying important online or real-life events from large textual document streams that are freely available on the World Wide Web is increasingly gaining popularity, given the flourishing of the social web. An event triggers discussion and comments on the WWW, especially in the blogosphere and in microblogging services. Consequently, one should be able to identify the involved entities, topics, time, and location of events through the analysis of information publicly available on the web, create semantically rich representations of events, and then use this information to provide interesting results, or summarize news to users. In this paper, we define the concept of an important event and propose an efficient methodology for performing event detection from large time-stamped web document streams. The methodology successfully integrates named entity recognition, dynamic topic map discovery, topic clustering, and peak detection techniques. In addition, we propose an efficient algorithm for detecting all important events from a document stream. We perform extensive evaluation of the proposed methodology and algorithm on a dataset of 7 million blogposts, as well as through an international social event detection challenge. The results provide evidence that our approach: a) accurately detects important events, b) creates semantically rich representations of the detected events, c) can be adequately parameterized to correspond to different social perceptions of the event concept, and d) is suitable for online event detection on very large datasets. The expected complexity of the online facet of the proposed algorithm is linear with respect to the number of documents in the data stream. }
}
@article{Lee200544,
  title = {Mining association rules with multiple minimum supports using maximum constraints },
  journal = {International Journal of Approximate Reasoning },
  volume = {40},
  number = {1–2},
  pages = {44 - 54},
  year = {2005},
  note = {Data Mining and Granular Computing },
  issn = {0888-613X},
  doi = {http://dx.doi.org/10.1016/j.ijar.2004.11.006},
  url = {http://www.sciencedirect.com/science/article/pii/S0888613X04001392},
  author = {Yeong-Chyi Lee and Tzung-Pei Hong and Wen-Yang Lin},
  keywords = {Data mining},
  keywords = {Multiple minimum supports},
  keywords = {Association rule},
  keywords = {Maximum constraint },
  abstract = {Data mining is the process of extracting desirable knowledge or interesting patterns from existing databases for specific purposes. Most previous approaches set a single minimum support threshold for all the items or itemsets. But in real applications, different items may have different criteria to judge their importance. The support requirements should then vary with different items. In this paper, we provide another point of view about defining the minimum supports of itemsets when items have different minimum supports. The maximum constraint is used, which is well explained and may be suitable for some mining domains. We then propose a simple algorithm based on the Apriori approach to find the large itemsets and association rules under this constraint. The proposed algorithm is easy and efficient when compared to Wang et al.’s under the maximum constraint. The numbers of association rules and large itemsets obtained by the proposed mining algorithm using the maximum constraint are also less than those using the minimum constraint. Whether to adopt the proposed approach thus depends on the requirements of the mining problem. Besides, the granular computing technique of bit strings is used to speed up the proposed data mining algorithm. }
}
@article{Lee2006142,
  title = {Mining protein–protein interaction information on the internet },
  journal = {Expert Systems with Applications },
  volume = {30},
  number = {1},
  pages = {142 - 148},
  year = {2006},
  note = {Intelligent Bioinformatics Systems },
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2005.09.083},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417405002496},
  author = {Hsi-Chieh Lee and Szu-Wei Huang and Eldon Y. Li},
  keywords = {Protein–protein interactions},
  keywords = {Rule-based system},
  keywords = {Data mining},
  keywords = {Information retrieval},
  keywords = {Keypage-based search },
  abstract = {In this study, a mining system is proposed for finding protein–protein interaction literature in the databases on the Internet. In this system, we identify discriminating words for protein–protein interaction by way of statistics and results from the literature. A threshold is also evaluated to check whether a given article is related to protein–protein interactions. In addition, a keypage-based search mechanism is used to find related papers on protein–protein interactions from a given document. To expand the search space and ensure better performance of the system, mechanisms for protein name identification and databases of protein names are also developed. The system is designed with a web-based user interface and a job-dispatching kernel. Experiments were conducted and the results have been checked by a biomedical expert. The experimental results indicate that the proposed mining system helps researchers find protein–protein interaction literature amid the overwhelming amount of information available in the biomedical databases on the Internet. }
}
@article{Duan20136094,
  title = {Web objectionable text content detection using topic modeling technique },
  journal = {Expert Systems with Applications },
  volume = {40},
  number = {15},
  pages = {6094 - 6104},
  year = {2013},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2013.05.032},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417413003291},
  author = {Jiangjiao Duan and Jianping Zeng},
  keywords = {Objectionable text},
  keywords = {Content filtering},
  keywords = {Semantic model},
  keywords = {Detection framework},
  keywords = {Topic model },
  abstract = {Web 2.0 technologies have made it easy for Web users to create and spread objectionable text content, which has been shown to be harmful to Web users, especially young children. Although detection methods based on keyword lists achieve faster detection and lower memory consumption, they fail to detect text content that is objectionable in its semantic description. A framework that integrates a semantic model with a detection method is proposed to perform probability inference for detecting this kind of Web text content. Based on the observation that an objectionable scene can be described by a set of sentences, a topic model learnt from the set is employed to act as a semantic model of the objectionable scene. For a given sentence, a probability value which shows the likelihood of the sentence with respect to the model is calculated in the framework. Then we use a mapping function to transform the probability value into a new indicator which is convenient for making the final decision. Extensive comparison experiments on two real-world text sets show that the framework can effectively recognize semantically objectionable text, and both the detection rate and the false alarm rate are superior to those of traditional methods. }
}
@article{Liu2005630,
  title = {Incremental mining of information interest for personalized web scanning },
  journal = {Information Systems },
  volume = {30},
  number = {8},
  pages = {630 - 648},
  year = {2005},
  note = {},
  issn = {0306-4379},
  doi = {http://dx.doi.org/10.1016/j.is.2004.07.001},
  url = {http://www.sciencedirect.com/science/article/pii/S0306437904000638},
  author = {Rey-Long Liu and Wan-Jung Lin},
  keywords = {Context of information interest},
  keywords = {Precise interest specifications},
  keywords = {Comprehensible interest specifications},
  keywords = {Incremental text mining},
  keywords = {Web scanning },
  abstract = {Businesses and people often organize their information of interest (IOI) into a hierarchy of folders (or categories). The personalized folder hierarchy provides a natural way for each of the users to manage and utilize his/her \{IOI\} (a folder corresponds to an interest type). Since the interest is relatively long-term, continuous web scanning is essential. It should be directed by precise and comprehensible specifications of the interest. A precise specification may direct the scanner to those spaces that deserve scanning, while a specification comprehensible to the user may facilitate manual refinement, and a specification comprehensible to information providers (e.g. Internet search engines) may facilitate the identification of proper seed sites to start scanning. However, expressing such specifications is quite difficult (and even implausible) for the user, since each interest type is often implicitly and collectively defined by the content (i.e. documents) of the corresponding folder, which may even evolve over time. In this paper, we present an incremental text mining technique to efficiently identify the user's current interest by mining the user's information folders. The specification mined for each interest type specifies the context of the interest type in conjunctive normal form, which is comprehensible to general users and information providers. The specification is also shown to be more precise in directing the scanner to those sites that are more likely to provide IOI. The user may thus maintain his/her folders and then constantly get IOI, without paying much attention to the difficult tasks of interest specification and seed identification. }
}
@article{Jang20137492,
  title = {Deep sentiment analysis: Mining the causality between personality-value-attitude for analyzing business ads in social media },
  journal = {Expert Systems with Applications },
  volume = {40},
  number = {18},
  pages = {7492 - 7503},
  year = {2013},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2013.06.069},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417413004685},
  author = {Haeng-Jin Jang and Jaemoon Sim and Yonnim Lee and Ohbyung Kwon},
  keywords = {Sentiment analysis},
  keywords = {Data analytics},
  keywords = {Value structure},
  keywords = {Euler graph},
  keywords = {Social media },
  abstract = {\{IT\} vendors routinely use social media such as YouTube not only to disseminate their \{IT\} product information, but also to acquire customer input efficiently as part of their market research strategies. Customer responses that appear in social media, however, are typically unstructured; thus, a fairly large data set is needed for meaningful analysis. Although identifying customers’ value structures and attitudes may be useful for developing targeted or niche markets, the unstructured and volume-heavy nature of customer data prohibits efficient and economical extraction of such information. Automatic extraction of customer information would be valuable in determining value structure and strength. This paper proposes an intelligent method of estimating causality between user profiles, value structures, and attitudes based on the replies and published content managed by open social network systems such as YouTube. To show the feasibility of the idea proposed in this paper, information richness and agility are used as underlying concepts to create performance measures based on media/information richness theory. The resulting deep sentiment analysis proves to be superior to legacy sentiment analysis tools for estimation of causality among the focal parameters. }
}
@article{Lee20061113,
  title = {Mining the customer credit using classification and regression tree and multivariate adaptive regression splines },
  journal = {Computational Statistics & Data Analysis },
  volume = {50},
  number = {4},
  pages = {1113 - 1130},
  year = {2006},
  note = {},
  issn = {0167-9473},
  doi = {http://dx.doi.org/10.1016/j.csda.2004.11.006},
  url = {http://www.sciencedirect.com/science/article/pii/S016794730400355X},
  author = {Tian-Shyug Lee and Chih-Chou Chiu and Yu-Chao Chou and Chi-Jie Lu},
  keywords = {Credit scoring},
  keywords = {Data mining},
  keywords = {Classification and regression tree},
  keywords = {Multivariate adaptive regression splines},
  keywords = {Support vector machine },
  abstract = {Credit scoring has become a very important task as the credit industry has been experiencing severe competition during the past few years. The artificial neural network is becoming a very popular alternative in credit scoring models due to its associated memory characteristic and generalization capability. However, the relative importance of potential input variables, long training process, and interpretative difficulties have often been criticized and hence limited its application in handling credit scoring problems. The objective of the proposed study is to explore the performance of credit scoring using two commonly discussed data mining techniques—classification and regression tree (CART) and multivariate adaptive regression splines (MARS). To demonstrate the effectiveness of credit scoring using \{CART\} and MARS, credit scoring tasks are performed on one bank credit card data set. As the results reveal, \{CART\} and \{MARS\} outperform traditional discriminant analysis, logistic regression, neural networks, and support vector machine (SVM) approaches in terms of credit scoring accuracy and hence provide efficient alternatives in implementing credit scoring tasks. }
}
@article{Brown2005309,
  title = {Landscape restoration following phosphate mining: 30 years of co-evolution of science, industry and regulation },
  journal = {Ecological Engineering },
  volume = {24},
  number = {4},
  pages = {309 - 329},
  year = {2005},
  note = {Wetland creation },
  issn = {0925-8574},
  doi = {http://dx.doi.org/10.1016/j.ecoleng.2005.01.014},
  url = {http://www.sciencedirect.com/science/article/pii/S092585740500039X},
  author = {Mark T. Brown},
  keywords = {Landscape},
  keywords = {Phosphate mining},
  keywords = {Ecosystems },
  abstract = {The restoration of phosphate mined lands in Florida is large scale, potentially covering over 300,000 acres (121,000 ha), and rivals other restoration efforts like the Florida Everglades in size and complexity. The issues surrounding mining and subsequent restoration of the landscape are global, national, and local in scale. The entire system of phosphate mining and restoration involves local citizens, governmental agencies, research scientists, and industry personnel in a program that might be seen as adaptive management. It is suggested that restoration is managing adaptive self-organization of the ecosystems and landscapes and that it is the domain of ecological engineering. The past 30 years of research concerning various aspects of landscape restoration after phosphate mining are elucidated, and the research's relationship to management and regulation is discussed. Finally, the complex issues that are inherent in large restoration programs are discussed, and it is suggested that a cooperative environment and shared vision may be the key elements that are missing. }
}
@article{Yun2003181,
  title = {Mining association rules on significant rare data using relative support },
  journal = {Journal of Systems and Software },
  volume = {67},
  number = {3},
  pages = {181 - 191},
  year = {2003},
  note = {},
  issn = {0164-1212},
  doi = {http://dx.doi.org/10.1016/S0164-1212(02)00128-0},
  url = {http://www.sciencedirect.com/science/article/pii/S0164121202001280},
  author = {Hyunyoon Yun and Danshim Ha and Buhyun Hwang and Keun Ho Ryu},
  keywords = {Data mining},
  keywords = {Potential information},
  keywords = {Association rules},
  keywords = {Significant rare data },
  abstract = {Recently, data mining, a technique for analyzing the data stored in large databases to discover potential information and knowledge, has been a popular topic in database research. In this paper, we study techniques for discovering association rules, one such data mining technique. We propose a technique for discovering association rules on significant rare data, i.e., data that appear infrequently in the database but are highly associated with specific data. Furthermore, considering these significant rare data, we evaluate the performance of the proposed algorithm by comparing it with other existing algorithms for discovering association rules. }
}
@article{Battistini2013147,
  title = {Web data mining for automatic inventory of geohazards at national scale },
  journal = {Applied Geography },
  volume = {43},
  number = {0},
  pages = {147 - 158},
  year = {2013},
  note = {},
  issn = {0143-6228},
  doi = {http://dx.doi.org/10.1016/j.apgeog.2013.06.012},
  url = {http://www.sciencedirect.com/science/article/pii/S014362281300146X},
  author = {Alessandro Battistini and Samuele Segoni and Goffredo Manzo and Filippo Catani and Nicola Casagli},
  keywords = {Geotagging},
  keywords = {News},
  keywords = {Geodatabase},
  keywords = {Geohazards},
  keywords = {Inventories},
  keywords = {WebGIS },
  abstract = {In this study, we present a fully automated procedure for analyzing online news using data mining techniques, which is then used to compile and continually update a geohazard database. The procedure builds on the new technologies that publish news on the internet: the news is analyzed, georeferenced and attributed to a category of geohazards (the current categories are landslides, floods and earthquakes). A continuous flow of georeferenced events is established to populate and update the geodatabase automatically and in near-real time. We tested the procedure for 2 years at a national scale, creating a geodatabase containing more than 20,000 news items concerning geohazards that occurred in Italy. This procedure enables continuous feedback from events in the real world, such that information about geohazards can be exploited rapidly compared to traditional techniques based on remote sensing, field surveys and historical inventories. }
}
@article{Wu200571,
  title = {Database classification for multi-database mining },
  journal = {Information Systems },
  volume = {30},
  number = {1},
  pages = {71 - 88},
  year = {2005},
  note = {},
  issn = {0306-4379},
  doi = {http://dx.doi.org/10.1016/j.is.2003.10.001},
  url = {http://www.sciencedirect.com/science/article/pii/S0306437903001017},
  author = {Xindong Wu and Chengqi Zhang and Shichao Zhang},
  keywords = {Database selection},
  keywords = {Classification},
  keywords = {Multi-database mining },
  abstract = {Many large organizations have multiple databases distributed in different branches, and therefore multi-database mining is an important task for data mining. To reduce the search cost in the data from all databases, we need to identify which databases are most likely relevant to a data mining application. This is referred to as database selection. For real-world applications, database selection has to be carried out multiple times to identify relevant databases that meet different applications. In particular, a mining task may be without reference to any specific application. In this paper, we present an efficient approach for classifying multiple databases based on their similarity between each other. Our approach is application-independent. }
}
@article{Sakurai200562,
  title = {An e-mail analysis method based on text mining techniques },
  journal = {Applied Soft Computing },
  volume = {6},
  number = {1},
  pages = {62 - 71},
  year = {2005},
  note = {},
  issn = {1568-4946},
  doi = {http://dx.doi.org/10.1016/j.asoc.2004.10.007},
  url = {http://www.sciencedirect.com/science/article/pii/S1568494604001115},
  author = {S. Sakurai and A. Suyama},
  keywords = {Text mining},
  keywords = {Fuzzy inductive learning},
  keywords = {E-mail},
  keywords = {Customer center },
  abstract = {This paper proposes a method employing text mining techniques to analyze e-mails collected at a customer center. The method uses two kinds of domain-dependent knowledge. One is a key concept dictionary manually provided by human experts. The other is a concept relation dictionary automatically acquired by a fuzzy inductive learning algorithm. The method takes the subject and the body of an e-mail as input and decides a text class for the e-mail. The method also extracts key concepts from e-mails and presents their statistical information. This paper applies the method to three kinds of analysis tasks: a product analysis task, a contents analysis task, and an address analysis task. The results of numerical experiments indicate that the acquired concept relation dictionaries correspond to the intuition of operators in the customer center and yield highly precise classification. }
}
@article{Lihui20051225,
  title = {Using Web structure and summarisation techniques for Web content mining },
  journal = {Information Processing & Management },
  volume = {41},
  number = {5},
  pages = {1225 - 1242},
  year = {2005},
  note = {},
  issn = {0306-4573},
  doi = {http://dx.doi.org/10.1016/j.ipm.2004.08.003},
  url = {http://www.sciencedirect.com/science/article/pii/S0306457304001049},
  author = {Chen Lihui and Chue Wai Lian},
  keywords = {Knowledge representation of Web Documents},
  keywords = {Web structure},
  keywords = {Summarisation},
  keywords = {Web content mining},
  keywords = {Content-based automatic Web Document clustering },
  abstract = {The dynamic nature and size of the Internet can result in difficulty finding relevant information. Most users typically express their information need via short queries to search engines, and they often have to physically sift through the search results based on the relevance ranking set by the search engines, making the process of relevance judgement time-consuming. In this paper, we describe a novel representation technique which makes use of the Web structure together with summarisation techniques to better represent knowledge in actual Web Documents. We name the proposed technique the Semantic Virtual Document (SVD). We discuss how the proposed \{SVD\} can be used together with a suitable clustering algorithm to achieve an automatic content-based categorization of similar Web Documents. The auto-categorization facility, as well as a “Tree-like” Graphical User Interface (GUI) for post-retrieval document browsing, enhances the relevance judgement process for Internet users. Furthermore, we introduce how our cluster-biased automatic query expansion technique can be used to overcome the ambiguity of short queries typically given by users. We outline our experimental design to evaluate the effectiveness of the proposed \{SVD\} for representation and present a prototype called iSEARCH (Intelligent \{SEarch\} And Review of Cluster Hierarchy) for Web content mining. Our results confirm, quantify and extend previous research using Web structure and summarisation techniques, introducing novel techniques for knowledge representation to enhance Web content mining. }
}
@article{daSilva2005791,
  title = {Distributed data mining and agents },
  journal = {Engineering Applications of Artificial Intelligence },
  volume = {18},
  number = {7},
  pages = {791 - 807},
  year = {2005},
  note = {},
  issn = {0952-1976},
  doi = {http://dx.doi.org/10.1016/j.engappai.2005.06.004},
  url = {http://www.sciencedirect.com/science/article/pii/S095219760500076X},
  author = {Josenildo C. da Silva and Chris Giannella and Ruchita Bhargava and Hillol Kargupta and Matthias Klusch},
  keywords = {Multi-agent systems},
  keywords = {Distributed data mining},
  keywords = {Clustering },
  abstract = {Multi-agent systems (MAS) offer an architecture for distributed problem solving. Distributed data mining (DDM) algorithms focus on one class of such distributed problem solving tasks—analysis and modeling of distributed data. This paper offers a perspective on \{DDM\} algorithms in the context of multi-agents systems. It discusses broadly the connection between \{DDM\} and MAS. It provides a high-level survey of DDM, then focuses on distributed clustering algorithms and some potential applications in multi-agent-based problem solving scenarios. It reviews algorithms for distributed clustering, including privacy-preserving ones. It describes challenges for clustering in sensor-network environments, potential shortcomings of the current algorithms, and future work accordingly. It also discusses confidentiality (privacy preservation) and presents a new algorithm for privacy-preserving density-based clustering. }
}
@article{Jun2013789,
  title = {Mining Explainable User Interests from Scalable User Behavior Data },
  journal = {Procedia Computer Science },
  volume = {17},
  number = {0},
  pages = {789 - 796},
  year = {2013},
  note = {First International Conference on Information Technology and Quantitative Management },
  issn = {1877-0509},
  doi = {http://dx.doi.org/10.1016/j.procs.2013.05.101},
  url = {http://www.sciencedirect.com/science/article/pii/S1877050913002342},
  author = {Li Jun and Zhang Peng},
  keywords = {Behavior Targeting},
  keywords = {Probabilistic Latent Semantic Analysis},
  keywords = {User Category },
  abstract = {Capturing user interests from big user behavior data is critical for online advertising. Based on these user interests, advertisers can significantly reduce their advertising cost by delivering the most relevant ads to each user. The state-of-the-art user Behavior Targeting (BT) models treat user behaviors as documents and thus use topic models to extract user interests. A limitation of these methods is that user behaviors are usually described as unexplainable hidden topics, which cannot be directly used to guide online advertising. To this end, we propose in this paper a systematic User Interest Distribution Mining (UIDM for short) framework to extract explainable user interests from big user behavior data. In the solution, we first use Probabilistic Latent Semantic Analysis (PLSA) to discover the relationship between users and their behaviors, which can be described as hidden topics. Then, we construct a mapping matrix between the hidden topics and user interests by manually labeling a feature entity matrix. Experiments on real-world data sets demonstrate the performance of the proposed method. }
}
@article{Hong20041,
  title = {A fuzzy AprioriTid mining algorithm with reduced computational time },
  journal = {Applied Soft Computing },
  volume = {5},
  number = {1},
  pages = {1 - 10},
  year = {2004},
  note = {},
  issn = {1568-4946},
  doi = {http://dx.doi.org/10.1016/j.asoc.2004.03.009},
  url = {http://www.sciencedirect.com/science/article/pii/S1568494604000511},
  author = {Tzung-Pei Hong and Chan-Sheng Kuo and Shyue-Liang Wang},
  keywords = {Data mining},
  keywords = {Fuzzy set},
  keywords = {Association rule},
  keywords = {Transaction},
  keywords = {Quantitative value },
  abstract = {Due to the increasing use of very large databases and data warehouses, mining useful information and helpful knowledge from transactions is evolving into an important research area. Most conventional data mining algorithms identify the relations among transactions using binary values. Transactions with quantitative values are, however, commonly seen in real-world applications. In the past, we proposed a fuzzy mining algorithm based on the Apriori approach to explore interesting knowledge from transactions with quantitative values. This paper proposes another new fuzzy mining algorithm based on the AprioriTid approach to find fuzzy association rules from given quantitative transactions. Each item uses only the linguistic term with the maximum cardinality in later mining processes, thus making the number of fuzzy regions to be processed the same as the number of original items. The algorithm therefore focuses on the most important linguistic terms for reduced time complexity. Experimental results from data in a supermarket of a department store show the feasibility of the proposed mining algorithm. }
}
@article{Toroslu2005395,
  title = {Data mining in deductive databases using query flocks },
  journal = {Expert Systems with Applications },
  volume = {28},
  number = {3},
  pages = {395 - 407},
  year = {2005},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2004.12.001},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417404001459},
  author = {Ismail H. Toroslu and Meliha Yetisgen-Yildiz},
  keywords = {Data mining},
  keywords = {Association rule mining},
  keywords = {Query flock},
  keywords = {Deductive databases},
  keywords = {Recursive query evaluation },
  abstract = {Data mining can be defined as a process for finding trends and patterns in large data. An important technique for extracting useful information, such as regularities, from usually historical data is called association rule mining. Most research on data mining is concentrated on the traditional relational data model. On the other hand, the query flocks technique, which extends the concept of association rule mining with a ‘generate-and-test’ model for different kinds of patterns, can also be applied to deductive databases. In this paper, the query flocks technique is extended with view definitions, including recursive views. Although in our system the query flocks technique can be applied to a database schema including both the intensional database (IDB), or rules, and the extensional database (EDB), or tabled relations, we have designed an architecture to compile query flocks from Datalog into \{SQL\} in order to be able to use commercially available database management systems (DBMS) as the underlying engine of our system. However, since recursive Datalog views (IDBs) cannot be converted directly into \{SQL\} statements, they are materialized before the final compilation operation. On this architecture, optimizations suitable for the extended query flocks are also introduced. Using the prototype system, which was developed on a commercial database environment, the advantages of the new architecture, together with the optimizations, are also presented. }
}
@incollection{Chuvakin2013193,
  title = {Chapter 11 - Log Data Mining },
  editor = {Chuvakin, Anton and Schmidt, Kevin and Phillips, Chris},
  booktitle = {Logging and Log Management },
  publisher = {Syngress},
  edition = {},
  address = {Boston},
  year = {2013},
  pages = {193 - 205},
  isbn = {978-1-59749-635-3},
  doi = {http://dx.doi.org/10.1016/B978-1-59-749635-3.00011-7},
  url = {http://www.sciencedirect.com/science/article/pii/B9781597496353000117},
  author = {Anton Chuvakin and Kevin Schmidt and Chris Phillips},
  abstract = {This chapter is devoted to log mining or log knowledge discovery—a different type of log analysis, which does not rely on knowing what to look for. This takes the “high art” of log analysis to the next level by breaking the dependence on the lists of strings or patterns to look for in the logs. }
}
@article{Li2004207,
  title = {Web mining model and its applications for information gathering },
  journal = {Knowledge-Based Systems },
  volume = {17},
  number = {5–6},
  pages = {207 - 217},
  year = {2004},
  note = {Special Issue: Web Intelligence },
  issn = {0950-7051},
  doi = {http://dx.doi.org/10.1016/j.knosys.2004.05.002},
  url = {http://www.sciencedirect.com/science/article/pii/S0950705104000280},
  author = {Yuefeng Li and Ning Zhong},
  keywords = {Web intelligence},
  keywords = {Semantic Web},
  keywords = {Web mining},
  keywords = {Information gathering },
  abstract = {Web mining is used to automatically discover and extract information from Web-related data sources such as documents, logs, services, and user profiles. Although standard data mining methods may be applied for mining on the Web, many specific algorithms need to be developed and applied for various purposes of Web-based information processing in multiple Web resources, effectively and efficiently. In this paper, we propose an abstract Web mining model for extracting approximate concepts hidden in user profiles on the semantic Web. The abstract Web mining model represents knowledge on user profiles by using an ontology which consists of both ‘part-of’ and ‘is-a’ relations. We also describe the details of using the abstract Web mining model for information gathering. In this application, classes of the ontology are represented as subsets of a list of keywords. An efficient filtering algorithm is also developed to filter out most non-relevant inputs. }
}
@article{Yang2005723,
  title = {A text mining approach for automatic construction of hypertexts },
  journal = {Expert Systems with Applications },
  volume = {29},
  number = {4},
  pages = {723 - 734},
  year = {2005},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2005.05.003},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417405000990},
  author = {Hsin-Chang Yang and Chung-Hong Lee},
  keywords = {Automatic hypertext construction},
  keywords = {Self-organizing maps},
  keywords = {Text mining },
  abstract = {The research on automatic hypertext construction has grown rapidly in the last decade because there exists an urgent need to translate the gigantic amount of legacy documents into web pages. Unlike traditional ‘flat’ texts, a hypertext contains a number of navigational hyperlinks that point to some related hypertexts or locations of the same hypertext. Traditionally, these hyperlinks were constructed by the creators of the web pages with or without the help of some authoring tools. However, the gigantic number of documents produced each day prevents such manual construction. Thus an automatic hypertext construction method is necessary for content providers to efficiently produce adequate information that can be used by web surfers. Although most of the web pages contain a number of non-textual data such as images, sounds, and video clips, text data still contribute the major part of information about the pages. Therefore, it is not surprising that most automatic hypertext construction methods inherit from traditional information retrieval research. In this work, we propose a new automatic hypertext construction method based on a text mining approach. Our method applies the self-organizing map algorithm to cluster some flat text documents in a training corpus and generate two maps. We then use these maps to identify the sources and destinations of some important hyperlinks within these training documents. The constructed hyperlinks are then inserted into the training documents to translate them into hypertext form. Such translated documents will form the new corpus. Incoming documents can also be translated into hypertext form and added to the corpus through the same approach. Our method has been tested on a set of flat text documents collected from a newswire site. Although we only use Chinese text documents, our approach can be applied to any documents that can be transformed to a set of index terms. }
}
@article{Purohit20132438,
  title = {What kind of #conversation is Twitter? Mining #psycholinguistic cues for emergency coordination },
  journal = {Computers in Human Behavior },
  volume = {29},
  number = {6},
  pages = {2438 - 2447},
  year = {2013},
  note = {},
  issn = {0747-5632},
  doi = {http://dx.doi.org/10.1016/j.chb.2013.05.007},
  url = {http://www.sciencedirect.com/science/article/pii/S0747563213001581},
  author = {Hemant Purohit and Andrew Hampton and Valerie L. Shalin and Amit P. Sheth and John Flach and Shreyansh Bhatt},
  keywords = {Coordinated behavior},
  keywords = {Conversation analysis},
  keywords = {Information filtering},
  keywords = {Disaster response},
  keywords = {Twitter },
  abstract = {The information overload created by social media messages in emergency situations challenges response organizations to find targeted content and users. We aim to select useful messages by detecting the presence of conversation as an indicator of coordinated citizen action. Using simple linguistic indicators drawn from conversation analysis in social science, we model the presence of coordination in the communication landscape of Twitter (a popular microblogging service: http://www.twitter.com) using a corpus of 1.5 million tweets for various disaster and non-disaster events spanning different periods, lengths of time, and varied social significance. Within replies, retweets and tweets that mention other Twitter users, we found that domain-independent linguistic cues distinguish likely conversation from non-conversation in this online form of mediated communication. We demonstrate that these likely conversation subsets potentially contain more information than non-conversation subsets, whether or not the tweets are replies, retweets, or mention other Twitter users, as long as they reflect conversational properties. From a practical perspective, we have developed a model for trimming the candidate tweet corpus to identify a much smaller subset of data for submission to deeper, domain-dependent semantic analyses for the identification of actionable information nuggets for coordinated emergency response. }
}
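
The trimming idea above lends itself to a compact sketch. The cue lexicon below is only a guess at the kind of domain-independent linguistic indicators the paper draws from conversation analysis, not the authors' actual feature set; tweets scoring at least a couple of cues are kept for deeper semantic analysis.

import re

CUES = [
    re.compile(r"\byou(r|rs)?\b", re.I),        # addressing the other party
    re.compile(r"\?"),                           # questions invite a response
    re.compile(r"\b(thanks|thank you|please)\b", re.I),
    re.compile(r"\b(yes|yeah|no|okay|ok)\b", re.I),
]

def conversation_score(tweet):
    return sum(1 for cue in CUES if cue.search(tweet))

def likely_conversation(tweets, min_cues=2):
    """Trim the corpus to tweets showing at least min_cues indicators."""
    return [t for t in tweets if conversation_score(t) >= min_cues]

print(likely_conversation([
    "Can you confirm the shelter on 5th St is open? Thanks!",
    "Earthquake magnitude 6.1 reported downtown.",
]))  # keeps only the first, conversation-like tweet
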
@article{Kanevski2004845,
  title = {Environmental data mining and modeling based on machine learning algorithms and geostatistics },
  journal = {Environmental Modelling & Software },
  volume = {19},
  number = {9},
  pages = {845 - 855},
  year = {2004},
  note = {Environmental Sciences and Artificial Intelligence },
  issn = {1364-8152},
  doi = {http://dx.doi.org/10.1016/j.envsoft.2003.03.004},
  url = {http://www.sciencedirect.com/science/article/pii/S1364815203002032},
  author = {M. Kanevski and R. Parkin and A. Pozdnukhov and V. Timonin and M. Maignan and V. Demyanov and S. Canu},
  keywords = {Environmental data mining and assimilation},
  keywords = {Geostatistics},
  keywords = {Machine learning},
  keywords = {Stochastic simulation},
  keywords = {Radioactive pollution },
  abstract = {The paper presents some contemporary approaches to spatial environmental data analysis. The main topics are concentrated on the decision-oriented problems of environmental spatial data mining and modeling: valorization and representativity of data with the help of exploratory data analysis, spatial predictions, probabilistic and risk mapping, development and application of conditional stochastic simulation models. The innovative part of the paper presents an integrated/hybrid model—machine learning (ML) residuals sequential simulations (MLRSS). The models are based on multilayer perceptron and support vector regression \{ML\} algorithms used for modeling long-range spatial trends and sequential simulations of the residuals. \{ML\} algorithms deliver non-linear solutions for spatially non-stationary problems, which are difficult for a geostatistical approach. Geostatistical tools (variography) are used to characterize the performance of \{ML\} algorithms, by analyzing the quality and quantity of the spatially structured information extracted from data with \{ML\} algorithms. Sequential simulations provide efficient assessment of uncertainty and spatial variability. A case study of the Chernobyl fallout illustrates the performance of the proposed model. It is shown that probability mapping, provided by the combination of \{ML\} data-driven and geostatistical model-based approaches, can be efficiently used in the decision-making process. }
}
@article{Jingxiang20091081,
  title = {Advanced \{GNSS\} technology of mining deformation monitoring },
  journal = {Procedia Earth and Planetary Science },
  volume = {1},
  number = {1},
  pages = {1081 - 1088},
  year = {2009},
  note = {special issue title: Proceedings of the International Conference on Mining Science & Technology (ICMST2009) },
  issn = {1878-5220},
  doi = {http://dx.doi.org/10.1016/j.proeps.2009.09.166},
  url = {http://www.sciencedirect.com/science/article/pii/S1878522009001672},
  author = {Gao Jing-xiang and Hu Hong},
  keywords = {mine surveying},
  keywords = {GPS/Pseudolites (PLs)},
  keywords = {empirical mode decomposition (EMD)},
  keywords = {wavelet },
  abstract = {The development of \{GNSS\} technology has spurred innovation in mining deformation monitoring. This paper probes into the features of mine Continuously Operating Reference Stations (CORS), and a precise height difference model was constructed in Yanzhou and Inner Mongolia to put the technology into practice. Based on mining CORS technology, we propose a large-area 3D deformation monitoring technology for mining. Based on RTK technology, we propose a small-area mining subsidence monitoring technology. Based on the constructed height difference model, GNSS-RTK technology is proposed for use in subsidence monitoring. This paper provides specific ways to modernize \{GNSS\} monitoring technology for mining deformation. }
}
@article{Muduli2013335,
  title = {Barriers to green supply chain management in Indian mining industries: a graph theoretic approach },
  journal = {Journal of Cleaner Production },
  volume = {47},
  number = {0},
  pages = {335 - 344},
  year = {2013},
  note = {Cleaner Production: initiatives and challenges for a sustainable world \{CP\} Initiatives & Challenges },
  issn = {0959-6526},
  doi = {http://dx.doi.org/10.1016/j.jclepro.2012.10.030},
  url = {http://www.sciencedirect.com/science/article/pii/S0959652612005628},
  author = {Kamalakanta Muduli and Kannan Govindan and Akhilesh Barve and Yong Geng},
  keywords = {\{GSCM\}},
  keywords = {\{GTMA\}},
  keywords = {Permanent},
  keywords = {Digraph},
  keywords = {Barriers to \{GSCM\} implementation },
  abstract = {A country's mining industry, despite its significant contributions to the country's economic growth, generally has a very poor public image because it is considered as a major environmental polluter. To acquire an improved social image, as well as to comply with government regulations, mining industries are increasingly implementing environmental management systems (EMS), cleaner production (CP), and adopting green supply chain management (GSCM) practices. \{GSCM\} focuses on a reduction of the adverse impacts of supply chain activities as well as a minimization of energy and material usage. This study focuses on the mining industry as a case study by which we will identify factors and sub-factors hindering \{GSCM\} implementation. A graph theoretic and matrix approach (GTMA) has been used to quantify the adverse impact of these barriers on \{GSCM\} implementation. An assessment of the inhibiting strength of the barriers will help decision makers rank them and decide a course of action that will make an optimum utilization of available resources during times of resource scarcity. }
}
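
The GTMA computation the abstract alludes to reduces to the permanent of a barrier digraph matrix. A brute-force sketch, with made-up diagonal (barrier strength) and off-diagonal (interdependence) values standing in for the paper's expert-derived ones:

from itertools import permutations
from math import prod

def permanent(M):
    """Matrix permanent by brute force; adequate for the small
    barrier matrices GTMA works with."""
    n = len(M)
    return sum(prod(M[i][p[i]] for i in range(n))
               for p in permutations(range(n)))

# Diagonal: strength of three hypothetical barrier groups;
# off-diagonal: relative interdependence between barriers (illustrative).
B = [
    [7, 3, 4],
    [2, 6, 3],
    [3, 2, 8],
]
print(permanent(B))  # single scalar index used to rank barrier scenarios
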
@article{Bahn2013129,
  title = {Workplace hazard identification and management: The case of an underground mining operation },
  journal = {Safety Science },
  volume = {57},
  number = {0},
  pages = {129 - 137},
  year = {2013},
  note = {},
  issn = {0925-7535},
  doi = {http://dx.doi.org/10.1016/j.ssci.2013.01.010},
  url = {http://www.sciencedirect.com/science/article/pii/S0925753513000246},
  author = {Susanne Bahn},
  keywords = {Work-related injury},
  keywords = {Hazard identification},
  keywords = {Workplace safety},
  keywords = {Safety culture},
  keywords = {Action research },
  abstract = {This paper uses the findings from two workshops conducted with 77 employees of an underground mining operation in Western Australia in April and May 2011. Risk management requires all managers and employees to identify hazards in their work environments. Managers assume that their employees have sufficient knowledge and skills to successfully identify not only obvious but also emerging hazards. For this study, two workshops were conducted using an action research methodology. In the first workshop, “Hazard Identification”, it was found that the range of workplace hazards the staff could identify was extensive for some groups and very limited for others. For example, length of experience underground did not predetermine an ability to identify hazards. Some of the longest serving and those in supervisory positions identified few hazards. Most teams identified 8–12 hazards under each of four categories within a typology: obvious, trivial, emerging and hidden hazards. However, the team with the least experience was unable to identify more than four obvious, two trivial, five emerging and three hidden hazards in their work areas. In workshop two, “Managing Workplace Hazards”, the teams showed a range of abilities to complete the task, with one team (with an average of 12 years’ experience underground) unable to identify any strategies to control the list of emerging hazards and one team of managers displaying limited skills. Given these results there is a need to provide further training for all managers and employees in hazard identification and management. }
}
@article{Hong20132361,
  title = {Multimedia encyclopedia construction by mining web knowledge },
  journal = {Signal Processing },
  volume = {93},
  number = {8},
  pages = {2361 - 2368},
  year = {2013},
  note = {Indexing of Large-Scale Multimedia Signals },
  issn = {0165-1684},
  doi = {http://dx.doi.org/10.1016/j.sigpro.2012.06.028},
  url = {http://www.sciencedirect.com/science/article/pii/S0165168412002204},
  author = {Richang Hong and Zheng-Jun Zha and Yue Gao and Tat-Seng Chua and Xindong Wu},
  keywords = {Web knowledge},
  keywords = {Multimedia encyclopedia },
  abstract = {In recent years, we have witnessed the blooming of Web 2.0 content such as Wikipedia, Flickr and YouTube, etc. How might we benefit from such rich media resources available on the internet? This paper presents a novel concept called Mediapedia, a dynamic multimedia encyclopedia that takes advantage of, and in fact is built from, the text and image resources on the Web. Mediapedia distinguishes itself from the traditional encyclopedia in four main ways. (1) It tries to present users with multimedia contents (e.g., text, image, video) which we believe are more intuitive and informative to users. (2) It is fully automated because it downloads the media contents as well as the corresponding textual descriptions from the Web and assembles them for presentation. (3) It is dynamic as it will use the latest multimedia content to compose the answer. This is not true for the traditional encyclopedia. (4) The design of Mediapedia is flexible and extensible such that we can easily incorporate new kinds of media (such as video) and new languages into the framework. The effectiveness of Mediapedia is demonstrated and two potential applications are described in this paper. }
}
@article{Wang2013629,
  title = {Biomedical time series clustering based on non-negative sparse coding and probabilistic topic model },
  journal = {Computer Methods and Programs in Biomedicine },
  volume = {111},
  number = {3},
  pages = {629 - 641},
  year = {2013},
  note = {},
  issn = {0169-2607},
  doi = {http://dx.doi.org/10.1016/j.cmpb.2013.05.022},
  url = {http://www.sciencedirect.com/science/article/pii/S016926071300179X},
  author = {Jin Wang and Ping Liu and Mary F.H. She and Saeid Nahavandi and Abbas Kouzani},
  keywords = {Unsupervised learning},
  keywords = {Bag-of-Words},
  keywords = {Probabilistic topic model},
  keywords = {Sparse coding },
  abstract = {Biomedical time series clustering that groups a set of unlabelled temporal signals according to their underlying similarity is very useful for biomedical records management and analysis such as biosignals archiving and diagnosis. In this paper, a new framework for clustering of long-term biomedical time series such as electrocardiography (ECG) and electroencephalography (EEG) signals is proposed. Specifically, local segments extracted from the time series are projected as a combination of a small number of basis elements in a trained dictionary by non-negative sparse coding. A Bag-of-Words (BoW) representation is then constructed by summing up all the sparse coefficients of local segments in a time series. Based on the BoW representation, a probabilistic topic model that was originally developed for text document analysis is extended to discover the underlying similarity of a collection of time series. The underlying similarity of biomedical time series is well captured owing to the statistical nature of the probabilistic topic model. Experiments on three datasets constructed from publicly available \{EEG\} and \{ECG\} signals demonstrate that the proposed approach achieves better accuracy than existing state-of-the-art methods, and is insensitive to model parameters such as length of local segments and dictionary size. }
}
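
A rough sketch of the Bag-of-Words stage described above, using scikit-learn's DictionaryLearning with a positivity constraint as a stand-in for the paper's non-negative sparse coding. The signal, window width, and dictionary size are arbitrary here; the paper reports insensitivity to the latter two.

import numpy as np
from sklearn.decomposition import DictionaryLearning

def segments(signal, width=32, step=16):
    """Slide a window over a 1-D signal to get local segments."""
    return np.array([signal[i:i + width]
                     for i in range(0, len(signal) - width + 1, step)])

rng = np.random.default_rng(0)
series = np.sin(np.linspace(0, 20 * np.pi, 1000)) + 0.1 * rng.standard_normal(1000)

X = segments(series)
dico = DictionaryLearning(n_components=8, transform_algorithm="lasso_lars",
                          transform_alpha=0.1, positive_code=True,
                          random_state=0)
codes = dico.fit_transform(X)   # non-negative sparse coefficients per segment
bow = codes.sum(axis=0)         # Bag-of-Words histogram for this series
print(bow)                      # histograms of many series feed the topic model
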
@article{Zhang2004109,
  title = {A multiagent data warehousing (MADWH) and multiagent data mining (MADM) approach to brain modeling and neurofuzzy control },
  journal = {Information Sciences },
  volume = {167},
  number = {1–4},
  pages = {109 - 127},
  year = {2004},
  note = {},
  issn = {0020-0255},
  doi = {http://dx.doi.org/10.1016/j.ins.2003.05.011},
  url = {http://www.sciencedirect.com/science/article/pii/S0020025503004055},
  author = {Wen-Ran Zhang and Lulu Zhang},
  keywords = {Multiagent data warehousing and mining},
  keywords = {Agent similarity and orthogonality},
  keywords = {Neurofuzzy agents},
  keywords = {Mining agent association in first-order logic},
  keywords = {Brain modeling},
  keywords = {Robot control },
  abstract = {Based on the hypothesis that the brain is a society of semiautonomous neural agents and full autonomy is the result of coordination of semiautonomous functionalities, a multiagent data warehousing (MADWH) and multiagent data mining (MADM) approach is presented for brain modeling and illustrated with robot motion control. An algorithm named Neighbor-Miner is proposed for \{MADWH\} and MADM. The algorithm is defined in an evolving dynamic environment with semiautonomous neurofuzzy agents. Instead of mining frequent itemsets from customer transactions, the new algorithm discovers new neurofuzzy agents and mines agent associations in first-order logic for coordination that was once considered impossible in traditional data mining. While the Apriori algorithm uses frequency as a priori threshold, Neighbor-Miner uses agent similarity as a priori knowledge. The concept of agent similarity leads to the notions of agent cuboid, orthogonal MADWH, and MADM. Based on agent similarities and action similarities, Neighbor-Miner is presented and illustrated with brain modeling for robot control. The novelty of a multiagent data warehouse lies in its ability to systematically combine neurofuzzy systems, multiagent systems, database systems, machine learning, data mining, information theory, neuroscience, decision, cognition, and control all together into a modern multidimensional information system architecture that is ideal for brain modeling of different animal species with manageable complexity. Although examples in robot control are used to illustrate the basic ideas, the new approach is generally suitable for data mining tasks where knowledge can be discovered collectively by a set of similar semiautonomous or autonomous agents from a geographically, geometrically, or temporally distributed environment, especially in high-dimensional scientific and engineering data environments. }
}
@article{Cano2006323,
  title = {On the combination of evolutionary algorithms and stratified strategies for training set selection in data mining },
  journal = {Applied Soft Computing },
  volume = {6},
  number = {3},
  pages = {323 - 332},
  year = {2006},
  note = {},
  issn = {1568-4946},
  doi = {http://dx.doi.org/10.1016/j.asoc.2005.02.006},
  url = {http://www.sciencedirect.com/science/article/pii/S1568494605000220},
  author = {José Ramón Cano and Francisco Herrera and Manuel Lozano},
  keywords = {Evolutionary algorithms},
  keywords = {Stratification},
  keywords = {Instance selection},
  keywords = {Training set selection},
  keywords = {Data mining },
  abstract = {In this paper, we present a new approach for training set selection in large size data sets. The algorithm consists of the combination of stratification and evolutionary algorithms. The stratification reduces the size of the domain where the selection is applied, while the evolutionary method selects the most representative instances. The performance of the proposal is compared with seven non-evolutionary algorithms, in stratified execution. The analysis follows two evaluating approaches: balance between reduction and accuracy of the subsets selected, and balance between interpretability and accuracy of the representation models associated to these subsets. The algorithms have been assessed on large and huge data sets. The study shows that the stratified evolutionary instance selection consistently outperforms the non-evolutionary ones. The main advantages are: high instance reduction rates, high classification accuracy and models with high interpretability. }
}
@article{Hu2004409,
  title = {iProLINK: an integrated protein resource for literature mining },
  journal = {Computational Biology and Chemistry },
  volume = {28},
  number = {5–6},
  pages = {409 - 416},
  year = {2004},
  note = {},
  issn = {1476-9271},
  doi = {http://dx.doi.org/10.1016/j.compbiolchem.2004.09.010},
  url = {http://www.sciencedirect.com/science/article/pii/S1476927104000830},
  author = {Zhang-Zhi Hu and Inderjeet Mani and Vincent Hermoso and Hongfang Liu and Cathy H. Wu},
  keywords = {PubMed},
  keywords = {UniProt},
  keywords = {Literature mining},
  keywords = {Natural language processing},
  keywords = {Post-translation modifications},
  keywords = {Protein annotation },
  abstract = {The exponential growth of large-scale molecular sequence data and of the PubMed scientific literature has prompted active research in biological literature mining and information extraction to facilitate genome/proteome annotation and improve the quality of biological databases. Motivated by the promise of text mining methodologies, but at the same time, the lack of adequate curated data for training and benchmarking, the Protein Information Resource (PIR) has developed a resource for protein literature mining—iProLINK (integrated Protein Literature \{INformation\} and Knowledge). As \{PIR\} focuses its effort on the curation of the UniProt protein sequence database, the goal of iProLINK is to provide curated data sources that can be utilized for text mining research in the areas of bibliography mapping, annotation extraction, protein named entity recognition, and protein ontology development. The data sources for bibliography mapping and annotation extraction include mapped citations (PubMed \{ID\} to protein entry and feature line mapping) and annotation-tagged literature corpora. The latter includes several hundred abstracts and full-text articles tagged with experimentally validated post-translational modifications (PTMs) annotated in the \{PIR\} protein sequence database. The data sources for entity recognition and ontology development include a protein name dictionary, word token dictionaries, protein name-tagged literature corpora along with tagging guidelines, as well as a protein ontology based on \{PIRSF\} protein family names. iProLINK is freely accessible at http://pir.georgetown.edu/iprolink, with hypertext links for all downloadable files. }
}
@article{Takigawa201350,
  title = {Graph mining: procedure, application to drug discovery and recent advances },
  journal = {Drug Discovery Today },
  volume = {18},
  number = {1–2},
  pages = {50 - 57},
  year = {2013},
  note = {},
  issn = {1359-6446},
  doi = {http://dx.doi.org/10.1016/j.drudis.2012.07.016},
  url = {http://www.sciencedirect.com/science/article/pii/S1359644612002759},
  author = {Ichigaku Takigawa and Hiroshi Mamitsuka},
  abstract = {Combinatorial chemistry has generated chemical libraries and databases with a huge number of chemical compounds, which include prospective drugs. Chemical structures of compounds can be represented as molecular graphs, to which a variety of graph-based techniques in computer science, specifically graph mining, can be applied. The most basic way of analyzing molecular graphs is using structural fragments, so-called subgraphs in graph theory. The mainstream technique in graph mining is frequent subgraph mining, by which we can retrieve essential subgraphs in given molecular graphs. In this article we explain the idea and procedure of mining frequent subgraphs from given molecular graphs, presenting some real applications, and we describe recent advances in graph mining. }
}
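
The core operation of frequent subgraph mining, support counting, can be illustrated with networkx. Real miners such as gSpan enumerate candidate fragments systematically; here a single hand-picked fragment is checked by subgraph isomorphism over toy molecular graphs.

import networkx as nx
from networkx.algorithms import isomorphism

def support(candidate, molecules):
    """Number of molecular graphs containing the candidate fragment."""
    count = 0
    for g in molecules:
        gm = isomorphism.GraphMatcher(
            g, candidate,
            node_match=lambda a, b: a["atom"] == b["atom"])
        if gm.subgraph_is_isomorphic():
            count += 1
    return count

def chain(atoms):
    """Build a path graph whose nodes are labeled with atom symbols."""
    g = nx.Graph()
    for i, a in enumerate(atoms):
        g.add_node(i, atom=a)
    g.add_edges_from(zip(range(len(atoms) - 1), range(1, len(atoms))))
    return g

mols = [chain("CCO"), chain("CCC"), chain("CO")]
frag = chain("CO")          # candidate fragment: carbon bonded to oxygen
print(support(frag, mols))  # -> 2
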
@article{LeBer2006170,
  title = {Studying crop sequences with CarrotAge, a HMM-based data mining software },
  journal = {Ecological Modelling },
  volume = {191},
  number = {1},
  pages = {170 - 185},
  year = {2006},
  note = {Selected Papers from the Fourth International Workshop on Environmental Applications of Machine Learning, September 27 - October 1, 2004, Bled, Slovenia },
  issn = {0304-3800},
  doi = {http://dx.doi.org/10.1016/j.ecolmodel.2005.08.031},
  url = {http://www.sciencedirect.com/science/article/pii/S0304380005003844},
  author = {F. Le Ber and M. Benoît and C. Schott and J.-F. Mari and C. Mignolet},
  keywords = {Data mining},
  keywords = {Land use},
  keywords = {Crop sequences},
  keywords = {Hidden Markov models },
  abstract = {We have developed a knowledge discovery system based on high-order hidden Markov models for analyzing spatio-temporal databases. This system, named CarrotAge, takes as input an array of discrete data – the rows represent the spatial sites and the columns the time slots – and builds a partition together with its a posteriori probability. CarrotAge has been developed for studying the cropping patterns of a territory. It therefore uses a French agricultural database, named Ter-Uti, which records every year the land-use category of a set of regularly spaced sites. The results of CarrotAge are interpreted by agronomists and used in research works linking agricultural land use and water management. Moreover, CarrotAge can be used to find out and study crop sequences in large territories, which is a main question for agricultural and environmental research, as discussed in this paper. }
}
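
CarrotAge is built on high-order hidden Markov models; the sketch below shows only the plain first-order forward computation on a toy crop sequence, with invented probabilities where CarrotAge estimates them from the Ter-Uti survey.

import numpy as np

start = np.array([0.6, 0.4])              # P(initial cropping pattern)
trans = np.array([[0.8, 0.2],             # P(pattern_t | pattern_{t-1})
                  [0.3, 0.7]])
emit = np.array([[0.7, 0.2, 0.1],         # P(land use | pattern):
                 [0.1, 0.3, 0.6]])        # columns: wheat, maize, grass

def sequence_likelihood(obs):
    """P(observed land-use sequence) via the forward algorithm."""
    alpha = start * emit[:, obs[0]]
    for o in obs[1:]:
        alpha = alpha.dot(trans) * emit[:, o]
    return alpha.sum()

# A site surveyed as wheat, wheat, grass, grass over four years:
print(sequence_likelihood([0, 0, 2, 2]))
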
@article{Katsaros2005129,
  title = {Fast mining of frequent tree structures by hashing and indexing },
  journal = {Information and Software Technology },
  volume = {47},
  number = {2},
  pages = {129 - 140},
  year = {2005},
  note = {},
  issn = {0950-5849},
  doi = {http://dx.doi.org/10.1016/j.infsof.2004.06.006},
  url = {http://www.sciencedirect.com/science/article/pii/S0950584904000989},
  author = {Dimitrios Katsaros and Alexandros Nanopoulos and Yannis Manolopoulos},
  keywords = {Tree mining},
  keywords = {Hashing},
  keywords = {Semistructured data},
  keywords = {Association rules },
  abstract = {Hierarchical semistructured data arise frequently in the Web, or in biological information processing applications. Semistructured objects describing the same type of information have similar but not identical structure. Usually they share some common ‘schema’. Finding the common schema of a collection of semistructured objects is a very important task and, due to the huge amount of such data encountered, data mining techniques have been employed. In this paper, we study the problem of discovering frequently occurring structures in semistructured objects using the notion of association rules. We identify that discovering the frequent structures in the early phases of the mining procedure is the dominant cost and we provide a fast algorithm addressing this issue. We present experimental results, which demonstrate the superiority of the proposed algorithm and also its efficiency in dramatically reducing the processing cost. }
}
@article{Santos2013269,
  title = {A data mining system for providing analytical information on brain tumors to public health decision makers },
  journal = {Computer Methods and Programs in Biomedicine },
  volume = {109},
  number = {3},
  pages = {269 - 282},
  year = {2013},
  note = {},
  issn = {0169-2607},
  doi = {http://dx.doi.org/10.1016/j.cmpb.2012.10.010},
  url = {http://www.sciencedirect.com/science/article/pii/S0169260712002520},
  author = {R.S. Santos and S.M.F. Malheiros and S. Cavalheiro and J.M. Parente de Oliveira},
  keywords = {Data analysis},
  keywords = {Ontology},
  keywords = {Decision support system},
  keywords = {Information system},
  keywords = {Brain neoplasms},
  keywords = {Health services administration },
  abstract = {Cancer is the leading cause of death in economically developed countries and the second leading cause of death in developing countries. Malignant brain neoplasms are among the most devastating and incurable forms of cancer, and their treatment may be excessively complex and costly. Public health decision makers require significant amounts of analytical information to manage public treatment programs for these patients. Data mining, a technology that is used to produce analytically useful information, has been employed successfully with medical data. However, the large-scale adoption of this technique has been limited thus far because it is difficult to use, especially for non-expert users. One way to facilitate data mining by non-expert users is to automate the process. Our aim is to present an automated data mining system that allows public health decision makers to access analytical information regarding brain tumors. The emphasis in this study is the use of ontology in an automated data mining process. The non-experts who tried the system obtained useful information about the treatment of brain tumors. These results suggest that future work should be conducted in this area. }
}
@article{Miele2013524,
  title = {A data-mining approach to preference-based data ranking founded on contextual information },
  journal = {Information Systems },
  volume = {38},
  number = {4},
  pages = {524 - 544},
  year = {2013},
  note = {Special section on \{BPM\} 2011 conference },
  issn = {0306-4379},
  doi = {http://dx.doi.org/10.1016/j.is.2012.12.002},
  url = {http://www.sciencedirect.com/science/article/pii/S0306437912001585},
  author = {Antonio Miele and Elisa Quintarelli and Emanuele Rabosio and Letizia Tanca},
  keywords = {Data personalization},
  keywords = {Context-awareness},
  keywords = {Pervasive computing},
  keywords = {Context-aware preference systems },
  abstract = {The term information overload was already used back in the 1970s by Alvin Toffler in his book Future Shock, and refers to the difficulty to understand and make decisions when too much information is available. In the era of Big Data, this problem becomes much more dramatic, since users may be literally overwhelmed by the cataract of data accessible in the most varied forms. With context-aware data tailoring, given a target application, in each specific context the system allows the user to access only the view which is relevant for that application in that context. Moreover, the relative importance of information to the same user in a different context or, reciprocally, to a different user in the same context, may vary enormously; for this reason, contextual preferences can be used to further refine the views associated with contexts, by imposing a ranking on the data of each context-aware view. In this paper, we propose a methodology and a system, \{PREMINE\} (PREference MINEr), where data mining is adopted to infer contextual preferences from the past interaction of the user with contextual views over a relational database, gathering knowledge in terms of association rules between each context and the relevant data. }
}
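
The mining step of PREMINE can be miniaturized as support/confidence counting over a log of (context, data) interactions. The log records and thresholds below are invented for illustration; the surviving rules would rank the data in each context-aware view.

from collections import Counter

log = [  # (context, attribute value the user looked at)
    ("commuting", "traffic"), ("commuting", "traffic"), ("commuting", "news"),
    ("home", "recipes"), ("home", "traffic"), ("home", "recipes"),
]

def contextual_preferences(log, min_support=2, min_conf=0.6):
    """Keep rules context -> item whose support and confidence clear the bar."""
    pair_counts = Counter(log)
    ctx_counts = Counter(ctx for ctx, _ in log)
    rules = {}
    for (ctx, item), n in pair_counts.items():
        conf = n / ctx_counts[ctx]
        if n >= min_support and conf >= min_conf:
            rules[(ctx, item)] = conf   # ranking score inside this context's view
    return rules

print(contextual_preferences(log))
# {('commuting', 'traffic'): 0.67, ('home', 'recipes'): 0.67} (rounded)
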
@article{Yavaş2005121,
  title = {A data mining approach for location prediction in mobile environments },
  journal = {Data & Knowledge Engineering },
  volume = {54},
  number = {2},
  pages = {121 - 146},
  year = {2005},
  note = {},
  issn = {0169-023X},
  doi = {http://dx.doi.org/10.1016/j.datak.2004.09.004},
  url = {http://www.sciencedirect.com/science/article/pii/S0169023X04001545},
  author = {Gökhan Yavaş and Dimitrios Katsaros and Özgür Ulusoy and Yannis Manolopoulos},
  keywords = {Location prediction},
  keywords = {Data mining},
  keywords = {Mobile computing},
  keywords = {Mobility patterns},
  keywords = {Mobility prediction },
  abstract = {Mobility prediction is one of the most essential issues that need to be explored for mobility management in mobile computing systems. In this paper, we propose a new algorithm for predicting the next inter-cell movement of a mobile user in a Personal Communication Systems network. In the first phase of our three-phase algorithm, user mobility patterns are mined from the history of mobile user trajectories. In the second phase, mobility rules are extracted from these patterns, and in the last phase, mobility predictions are accomplished by using these rules. The performance of the proposed algorithm is evaluated through simulation as compared to two other prediction methods. The performance results obtained in terms of Precision and Recall indicate that our method can make more accurate predictions than the other methods. }
}
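
The three phases of the algorithm shrink naturally to a toy example: mine movement patterns from trajectories, convert them to mobility rules with a confidence value, and predict the next cell. The paper's patterns span longer subsequences; consecutive cell pairs keep this sketch short.

from collections import Counter

trajectories = [["A", "B", "C"], ["A", "B", "D"], ["E", "B", "C"]]

# Phase 1: mine movement patterns (here: consecutive cell pairs).
pairs = Counter((t[i], t[i + 1]) for t in trajectories for i in range(len(t) - 1))
heads = Counter(p[0] for p in pairs.elements())

# Phase 2: extract mobility rules head -> next with a confidence value.
rules = {}
for (head, nxt), n in pairs.items():
    rules.setdefault(head, []).append((nxt, n / heads[head]))

# Phase 3: predict the most confident next cell.
def predict(cell):
    return max(rules.get(cell, [(None, 0)]), key=lambda r: r[1])[0]

print(predict("B"))  # -> 'C' (B moved to C twice, to D once)
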
@article{Kwon2013182,
  title = {Heterogeneity of deal proneness: Value-mining, price-mining, and encounters },
  journal = {Journal of Retailing and Consumer Services },
  volume = {20},
  number = {2},
  pages = {182 - 188},
  year = {2013},
  note = {},
  issn = {0969-6989},
  doi = {http://dx.doi.org/10.1016/j.jretconser.2012.11.006},
  url = {http://www.sciencedirect.com/science/article/pii/S0969698912001403},
  author = {Kyoung-Nan Kwon and Yoo Jin Kwon},
  keywords = {Deal proneness},
  keywords = {Human capital},
  keywords = {Consumer behavior },
  abstract = {This study uncovers the heterogeneity of deal proneness in relation to the cost of deal shopping (e.g., time and effort) and shopping capital (i.e., skills and expertise in shopping). The possession of shopping capital determines the cost of deal shopping and how consumers take advantage of deals. Findings of in-depth interviews include: (1) different types of deal shopping—value-mining, price-mining, and encounters, (2) strategies of deal-prone consumers with a high level of shopping capital—creating shortcuts, prospective thinking, and engineering deals, and (3) the sources of deal gratification that result from different goals of deal-prone behaviors. }
}
@article{Rathemacher2013205,
  title = {Developing Issues in Licensing: Text Mining, MOOCs, and More },
  journal = {Serials Review },
  volume = {39},
  number = {3},
  pages = {205 - 210},
  year = {2013},
  note = {},
  issn = {0098-7913},
  doi = {http://dx.doi.org/10.1016/j.serrev.2013.07.016},
  url = {http://www.sciencedirect.com/science/article/pii/S0098791313001263},
  author = {Andrée J. Rathemacher}
}
@article{Lehmann2005143,
  title = {Automatic categorization of medical images for content-based retrieval and data mining },
  journal = {Computerized Medical Imaging and Graphics },
  volume = {29},
  number = {2–3},
  pages = {143 - 155},
  year = {2005},
  note = {Imaging Informatics },
  issn = {0895-6111},
  doi = {http://dx.doi.org/10.1016/j.compmedimag.2004.09.010},
  url = {http://www.sciencedirect.com/science/article/pii/S0895611104001168},
  author = {Thomas M. Lehmann and Mark O. Güld and Thomas Deselaers and Daniel Keysers and Henning Schubert and Klaus Spitzer and Hermann Ney and Berthold B. Wein},
  keywords = {Content-based image retrieval (CBIR)},
  keywords = {Data mining},
  keywords = {Medical imaging},
  keywords = {Pattern recognition},
  keywords = {Feature extraction},
  keywords = {Image categorization},
  keywords = {Texture analysis},
  keywords = {Classifier combination },
  abstract = {Categorization of medical images means selecting the appropriate class for a given image out of a set of pre-defined categories. This is an important step for data mining and content-based image retrieval (CBIR). So far, published approaches are capable of distinguishing up to 10 categories. In this paper, we evaluate automatic categorization into more than 80 categories describing the imaging modality and direction as well as the body part and biological system examined. Based on 6231 reference images from hospital routine, 85.5% correctness is obtained combining global texture features with scaled images. With a frequency of 97.7%, the correct class is within the best ten matches, which is sufficient for medical \{CBIR\} applications. }
}
@article{Wang2004365,
  title = {Effective personalized recommendation based on time-framed navigation clustering and association mining },
  journal = {Expert Systems with Applications },
  volume = {27},
  number = {3},
  pages = {365 - 377},
  year = {2004},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2004.05.005},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417404000405},
  author = {Feng-Hsu Wang and Hsiu-Mei Shao},
  keywords = {Personalized recommendation},
  keywords = {Web usage mining},
  keywords = {Clustering},
  keywords = {Association mining},
  keywords = {Collaborative filtering},
  keywords = {Web-based learning environment },
  abstract = {Personalized recommendation by predicting user-browsing behavior using association-mining technology has gained much attention in web personalization research area. However, the resulting association patterns did not perform well in prediction of future browsing patterns due to the low matching rate of the resulting rules and users' browsing behavior. This research proposes a new personalized recommendation method integrating user clustering and association-mining techniques. Historical navigation sessions for each user are divided into frames of sessions based on a specific time interval. This research proposes a new clustering method, called \{HBM\} (Hierarchical Bisecting Medoids Algorithm) to cluster users based on the time-framed navigation sessions. Those navigation sessions of the same group are analyzed using the association-mining method to establish a recommendation model for similar students in the future. Finally, an application of this recommendation method to an e-learning web site is presented, including plans of recommendation policies and proposal of new efficiency measures. The effectiveness of the recommendation methods, with and without time-framed user clustering, are investigated and compared. The results showed that the recommendation model built with user clustering by time-framed navigation sessions improves the recommendation services effectively. }
}
@article{Huang2013183,
  title = {Creating Process-Agents incrementally by mining process asset library },
  journal = {Information Sciences },
  volume = {233},
  number = {0},
  pages = {183 - 199},
  year = {2013},
  note = {},
  issn = {0020-0255},
  doi = {http://dx.doi.org/10.1016/j.ins.2012.12.052},
  url = {http://www.sciencedirect.com/science/article/pii/S0020025513000510},
  author = {Hui Huang and Junchao Xiao and Qiusong Yang and Qing Wang and Hong Wu},
  keywords = {Software process},
  keywords = {Human resource},
  keywords = {Clustering},
  keywords = {Process-Agent},
  keywords = {Trustworthiness },
  abstract = {Software process trustworthiness is the degree of confidence that a software process produces expected trustworthy work products that satisfy requirements. Software processes are dynamic and highly people-dependent. The performance of software processes relies not only on the process itself, but also on the personnel’s capabilities. Therefore, management of human resources and evaluation of a company’s work force capabilities are crucial and will affect software process trustworthiness. Our software process modeling method OEC-SPM (Organization-Entity Capability based Software Process Modeling) has been shown to take into account personnel’s capabilities and groups software developers with certain capabilities into a Process-Agent, which is a way of organizing human resources and process asset libraries in software organizations, and will help to improve trustworthiness of software processes. This paper proposes a novel method for incrementally mining Process-Agents from process asset libraries to support OEC-SPM. The method can automatically and incrementally create Process-Agents under three scenarios with high efficiency. Furthermore, we assess the method with data from a real industry setting. The results show that the utilization of human resources in an organization can be optimized when personnel’s capabilities are taken into account. Additionally, reasonable resource scheduling making use of Process-Agents will result in higher trustworthiness. }
}
@article{Iqbal201398,
  title = {A unified data mining solution for authorship analysis in anonymous textual communications },
  journal = {Information Sciences },
  volume = {231},
  number = {0},
  pages = {98 - 112},
  year = {2013},
  note = {Data Mining for Information Security },
  issn = {0020-0255},
  doi = {http://dx.doi.org/10.1016/j.ins.2011.03.006},
  url = {http://www.sciencedirect.com/science/article/pii/S0020025511001344},
  author = {Farkhund Iqbal and Hamad Binsalleeh and Benjamin C.M. Fung and Mourad Debbabi},
  keywords = {Authorship identification},
  keywords = {Authorship characterization},
  keywords = {Stylometric features},
  keywords = {Writeprint},
  keywords = {Frequent patterns},
  keywords = {Cyber forensics },
  abstract = {The cyber world provides an anonymous environment for criminals to conduct malicious activities such as spamming, sending ransom e-mails, and spreading botnet malware. Often, these activities involve textual communication between a criminal and a victim, or between criminals themselves. The forensic analysis of online textual documents for addressing the anonymity problem called authorship analysis is the focus of most cybercrime investigations. Authorship analysis is the statistical study of linguistic and computational characteristics of the written documents of individuals. This paper is the first work that presents a unified data mining solution to address authorship analysis problems based on the concept of frequent pattern-based writeprint. Extensive experiments on real-life data suggest that our proposed solution can precisely capture the writing styles of individuals. Furthermore, the writeprint is effective to identify the author of an anonymous text from a group of suspects and to infer sociolinguistic characteristics of the author. }
}
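
A much-simplified stand-in for the writeprint idea: profile each suspect by normalized function-word frequencies and attribute the anonymous text to the nearest profile. The paper mines frequent stylometric patterns instead; this sketch conveys only the overall identification workflow, on invented texts.

from collections import Counter
from math import dist

FUNCTION_WORDS = ["the", "of", "and", "to", "a", "in", "that", "is"]

def profile(text):
    """Normalized function-word frequency vector for one author's text."""
    words = text.lower().split()
    counts = Counter(words)
    total = max(len(words), 1)
    return [counts[w] / total for w in FUNCTION_WORDS]

def attribute(anonymous, suspects):
    """Return the suspect whose profile is nearest the anonymous text."""
    anon = profile(anonymous)
    return min(suspects, key=lambda s: dist(anon, profile(suspects[s])))

suspects = {
    "alice": "the cat sat on the mat and the dog barked at the cat",
    "bob": "to be or not to be is a question that is asked a lot",
}
print(attribute("the bird and the cat watched the dog", suspects))  # -> alice
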
@article{Li200471,
  title = {Data mining techniques for cancer detection using serum proteomic profiling },
  journal = {Artificial Intelligence in Medicine },
  volume = {32},
  number = {2},
  pages = {71 - 83},
  year = {2004},
  note = {},
  issn = {0933-3657},
  doi = {http://dx.doi.org/10.1016/j.artmed.2004.03.006},
  url = {http://www.sciencedirect.com/science/article/pii/S0933365704000545},
  author = {Lihua Li and Hong Tang and Zuobao Wu and Jianli Gong and Michael Gruidl and Jun Zou and Melvyn Tockman and Robert A. Clark},
  keywords = {Proteomics},
  keywords = {Cancer detection},
  keywords = {Data mining},
  keywords = {Statistical testing},
  keywords = {Genetic algorithm},
  keywords = {Support vector machine },
  abstract = {Objective: Pathological changes in an organ or tissue may be reflected in proteomic patterns in serum. It is possible that unique serum proteomic patterns could be used to discriminate cancer samples from non-cancer ones. Due to the complexity of proteomic profiling, a higher order analysis such as data mining is needed to uncover the differences in complex proteomic patterns. The objectives of this paper are (1) to briefly review the application of data mining techniques in proteomics for cancer detection/diagnosis; (2) to explore a novel analytic method with different feature selection methods; (3) to compare the results obtained on different datasets and that reported by Petricoin et al. in terms of detection performance and selected proteomic patterns. Methods and material: Three serum \{SELDI\} \{MS\} data sets were used in this research to identify serum proteomic patterns that distinguish the serum of ovarian cancer cases from non-cancer controls. A support vector machine-based method is applied in this study, in which statistical testing and genetic algorithm-based methods are used for feature selection respectively. Leave-one-out cross validation with receiver operating characteristic (ROC) curve is used for evaluation and comparison of cancer detection performance. Results and conclusions: The results showed that (1) data mining techniques can be successfully applied to ovarian cancer detection with a reasonably high performance; (2) the classification using features selected by the genetic algorithm consistently outperformed those selected by statistical testing in terms of accuracy and robustness; (3) the discriminatory features (proteomic patterns) can be very different from one selection method to another. In other words, the pattern selection and its classification efficiency are highly classifier dependent. Therefore, when using data mining techniques, the discrimination of cancer from normal does not depend solely upon the identity and origination of cancer-related proteins. }
}
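
The genetic-algorithm-plus-SVM pipeline compresses into a short scikit-learn sketch. Synthetic data stands in for the SELDI MS spectra, and the GA is pared down to selection and bit-flip mutation (no crossover) to stay brief; fitness is cross-validated SVM accuracy, as in the paper.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

rng = np.random.default_rng(0)
X, y = make_classification(n_samples=80, n_features=60, n_informative=6,
                           random_state=0)

def fitness(mask):
    """Cross-validated linear-SVM accuracy on the selected features."""
    if not mask.any():
        return 0.0
    return cross_val_score(SVC(kernel="linear"), X[:, mask], y, cv=5).mean()

pop = rng.random((12, X.shape[1])) < 0.2           # initial feature masks
for generation in range(15):
    scores = np.array([fitness(m) for m in pop])
    parents = pop[np.argsort(scores)[-6:]]         # selection: keep best half
    children = parents[rng.integers(0, 6, size=6)].copy()
    children ^= rng.random(children.shape) < 0.02  # bit-flip mutation
    pop = np.vstack([parents, children])

best = max(pop, key=fitness)
print(int(best.sum()), "features selected; CV accuracy", round(fitness(best), 3))
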
@article{Gallinucci201587,
  title = {Advanced topic modeling for social business intelligence },
  journal = {Information Systems },
  volume = {53},
  number = {0},
  pages = {87 - 106},
  year = {2015},
  note = {},
  issn = {0306-4379},
  doi = {http://dx.doi.org/10.1016/j.is.2015.04.005},
  url = {http://www.sciencedirect.com/science/article/pii/S0306437915000654},
  author = {Enrico Gallinucci and Matteo Golfarelli and Stefano Rizzi},
  keywords = {Business intelligence},
  keywords = {Social media},
  keywords = {User-generated content},
  keywords = {Multidimensional modeling },
  abstract = {Social business intelligence combines corporate data with user-generated content (UGC) to make decision-makers aware of the trends perceived from the environment. A key role in the analysis of textual \{UGC\} is played by topics, meant as specific concepts of interest within a subject area. To enable aggregations of topics at different levels, a topic hierarchy has to be defined. Some attempts have been made to address the peculiarities of topic hierarchies, but no comprehensive solution has been found so far. The approach we propose to model topic hierarchies in \{ROLAP\} systems is called meta-stars. Its basic idea is to use meta-modeling coupled with navigation tables and with dimension tables: navigation tables support hierarchy instances with different lengths and with non-leaf facts, and allow different roll-up semantics to be explicitly annotated; meta-modeling enables hierarchy heterogeneity and dynamics to be accommodated; dimension tables are easily integrated with standard business hierarchies. After outlining a reference architecture for social business intelligence and describing the meta-star approach, we formalize its querying expressiveness and give a cost model for the main query execution plans. Then, we evaluate meta-stars by presenting experimental results for query performances and disk space. }
}
@article{Wong2004451,
  title = {Data mining of Bayesian networks using cooperative coevolution },
  journal = {Decision Support Systems },
  volume = {38},
  number = {3},
  pages = {451 - 472},
  year = {2004},
  note = {},
  issn = {0167-9236},
  doi = {http://dx.doi.org/10.1016/S0167-9236(03)00115-5},
  url = {http://www.sciencedirect.com/science/article/pii/S0167923603001155},
  author = {Man Leung Wong and Shing Yan Lee and Kwong Sak Leung},
  keywords = {Cooperative coevolution},
  keywords = {Evolutionary computation},
  keywords = {Data mining},
  keywords = {Bayesian networks },
  abstract = {This paper describes a novel data mining algorithm that employs cooperative coevolution and a hybrid approach to discover Bayesian networks from data. A Bayesian network is a graphical knowledge representation tool. However, learning Bayesian networks from data is a difficult problem. There are two different approaches to the network learning problem. The first one uses dependency analysis, while the second approach searches for good network structures according to a metric. Unfortunately, the two approaches both have their own drawbacks. Thus, we propose a novel algorithm that combines the characteristics of these approaches to improve learning effectiveness and efficiency. The new learning algorithm consists of the conditional independence (CI) test and the search phases. In the \{CI\} test phase, dependency analysis is conducted to reduce the size of the search space. In the search phase, good Bayesian networks are generated by a cooperative coevolution genetic algorithm (GA). We conduct a number of experiments and compare the new algorithm with our previous algorithm, Minimum Description Length and Evolutionary Programming (MDLEP), which uses evolutionary programming (EP) for network learning. The results illustrate that the new algorithm has better performance. We apply the algorithm to a large real-world data set and compare the performance of the discovered Bayesian networks with that of the back-propagation neural networks and the logistic regression models. This study illustrates that the algorithm is a promising alternative to other data mining algorithms. }
}
@article{Witten2004137,
  title = {Adaptive text mining: inferring structure from sequences },
  journal = {Journal of Discrete Algorithms },
  volume = {2},
  number = {2},
  pages = {137 - 159},
  year = {2004},
  note = {Combinatiorial Pattern Matching },
  issn = {1570-8667},
  doi = {http://dx.doi.org/10.1016/S1570-8667(03)00084-4},
  url = {http://www.sciencedirect.com/science/article/pii/S1570866703000844},
  author = {I.H. Witten},
  keywords = {Text mining},
  keywords = {Phrase hierarchies},
  keywords = {Keyphrase extraction},
  keywords = {Generic entity extraction},
  keywords = {Text categorization},
  keywords = {Word segmentation},
  keywords = {Acronym extraction},
  keywords = {Compression algorithms},
  keywords = {Adaptive techniques },
  abstract = {Text mining is about inferring structure from sequences representing natural language text, and may be defined as the process of analyzing text to extract information that is useful for particular purposes. Although hand-crafted heuristics are a common practical approach for extracting information from text, a general, and generalizable, approach requires adaptive techniques. This paper studies the way in which the adaptive techniques used in text compression can be applied to text mining. It develops several examples: extraction of hierarchical phrase structures from text, identification of keyphrases in documents, locating proper names and quantities of interest in a piece of text, text categorization, word segmentation, acronym extraction, and structure recognition. We conclude that compression forms a sound unifying principle that allows many text mining problems to be tackled adaptively. }
}
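
The unifying principle of the paper invites a tiny demonstration: text should be closer, in compression terms, to same-category text. Here zlib and the normalized compression distance stand in for the adaptive PPM-style models used in the compression literature, and the two-category corpus is invented.

import zlib

def C(s: bytes) -> int:
    return len(zlib.compress(s, 9))

def ncd(a: bytes, b: bytes) -> float:
    """Normalized compression distance between two byte strings."""
    return (C(a + b) - min(C(a), C(b))) / max(C(a), C(b))

corpus = {
    "weather": b"rain tomorrow with strong winds and falling temperatures " * 4,
    "finance": b"shares rallied as quarterly earnings beat expectations " * 4,
}

def categorize(text: bytes) -> str:
    """Assign the category whose corpus compresses best together with text."""
    return min(corpus, key=lambda cat: ncd(text, corpus[cat]))

print(categorize(b"forecast says heavy rain and gale force winds"))  # weather
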
@incollection{Zhao2013105,
  title = {Chapter 10 - Text Mining },
  editor = {Zhao, Yangchang },
  booktitle = {R and Data Mining },
  publisher = {Academic Press},
  edition = {},
  address = {},
  year = {2013},
  pages = {105 - 122},
  isbn = {978-0-12-396963-7},
  doi = {http://dx.doi.org/10.1016/B978-0-12-396963-7.00010-6},
  url = {http://www.sciencedirect.com/science/article/pii/B9780123969637000106},
  author = {Yangchang Zhao},
  abstract = {This chapter presents examples of text mining with R. Twitter text of @RDataMining is used as the data to analyze. It starts with extracting text from Twitter. The extracted text is then transformed to build a document-term matrix. After that, frequent words and associations are found from the matrix. A word cloud is used to present important words in documents. In the end, words and tweets are clustered to find groups of words and also groups of tweets. }
}
@article{Jo201380,
  title = {Predicate contrastive topic constructions: Implications for morpho-syntax in Korean and copy theory of movement },
  journal = {Lingua },
  volume = {131},
  number = {0},
  pages = {80 - 111},
  year = {2013},
  note = {},
  issn = {0024-3841},
  doi = {http://dx.doi.org/10.1016/j.lingua.2013.02.003},
  url = {http://www.sciencedirect.com/science/article/pii/S0024384113000466},
  author = {Jung-Min Jo},
  keywords = {Predicate contrastive topic},
  keywords = {Phrasal movement},
  keywords = {Copy theory of movement},
  keywords = {PF-deletion},
  keywords = {Late lexical insertion},
  keywords = {Verbal inflections },
  abstract = {This paper closely examines two types of predicate contrastive topic constructions in Korean, i.e. Ha-construction and R-construction, and argues that they are derived from the same underlying structure via the same process, which involves a phrasal movement including a subject, leaving behind the copy of the moved constituent (cf. Chomsky, 1995), in contrast to the head movement analysis. That is, as far as syntactic process is concerned, Ha- and R-constructions are one and the same construction. Two constructions and their surface variations arise as a result of selective PF-deletion process in the lower copy of the moved constituent. The analysis proposed in this paper provides a uniform analysis of two types of predicate contrastive topic constructions in Korean and also provides a systematic account of both similarities and differences with regard to morpho-syntactic and semantic properties found in two construction types. To the extent the current proposal holds, it provides strong empirical evidence for copy theory of movement and late lexical insertion along the lines of Distributed Morphology (cf. Halle and Marantz, 1993). Furthermore, it constitutes strong empirical evidence for the assumption that verbal inflections as well as verbal roots are independently projected to syntactic structure in Korean. }
}
@article{Hsieh2004623,
  title = {An integrated data mining and behavioral scoring model for analyzing bank customers },
  journal = {Expert Systems with Applications },
  volume = {27},
  number = {4},
  pages = {623 - 633},
  year = {2004},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2004.06.007},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417404000703},
  author = {Nan-Chen Hsieh},
  keywords = {Data mining},
  keywords = {Behavioral scoring model},
  keywords = {Customer segmentation},
  keywords = {Neural network},
  keywords = {Association rule },
  abstract = {Analyzing bank databases for customer behavior management is difficult since bank databases are multi-dimensional, comprised of monthly account records and daily transaction records. This study proposes an integrated data mining and behavioral scoring model to manage existing credit card customers in a bank. A self-organizing map neural network was used to identify groups of customers based on repayment behavior and recency, frequency, monetary behavioral scoring predictors. It also classified bank customers into three major profitable groups of customers. The resulting groups of customers were then profiled by customer's feature attributes determined using an Apriori association rule inducer. This study demonstrates that identifying customers by a behavioral scoring model is helpful in characterizing customers and facilitates the development of marketing strategy. }
}
@article{Caramia20042387,
  title = {Improving search results with data mining in a thematic search engine },
  journal = {Computers & Operations Research },
  volume = {31},
  number = {14},
  pages = {2387 - 2404},
  year = {2004},
  note = {},
  issn = {0305-0548},
  doi = {http://dx.doi.org/10.1016/S0305-0548(03)00194-1},
  url = {http://www.sciencedirect.com/science/article/pii/S0305054803001941},
  author = {M Caramia and G Felici and A Pezzoli},
  keywords = {Search engines},
  keywords = {Web mining},
  keywords = {Clustering},
  keywords = {Genetic algorithms },
  abstract = {The problem of obtaining relevant results in web searching has been tackled with several approaches. Although very effective techniques are currently used by the most popular search engines when no a priori knowledge on the user's desires besides the search keywords is available, in different settings it is conceivable to design search methods that operate on a thematic database of web pages that refer to a common body of knowledge or to specific sets of users. We have considered such premises to design and develop a search method that deploys data mining and optimization techniques to provide a more significant and restricted set of pages as the final result of a user search. We adopt a vectorization method based on search context and user profile to apply clustering techniques that are then refined by a specially designed genetic algorithm. In this paper we describe the method, its implementation, the algorithms applied, and discuss some experiments that have been run on test sets of web pages. }
}
@article{Tasoulis201387,
  title = {Statistical data mining of streaming motion data for activity and fall recognition in assistive environments },
  journal = {Neurocomputing },
  volume = {107},
  number = {0},
  pages = {87 - 96},
  year = {2013},
  note = {Timely Neural Networks Applications in Engineering Selected Papers from the 12th \{EANN\} International Conference, 2011 },
  issn = {0925-2312},
  doi = {http://dx.doi.org/10.1016/j.neucom.2012.08.036},
  url = {http://www.sciencedirect.com/science/article/pii/S0925231212007606},
  author = {S.K. Tasoulis and C.N. Doukas and V.P. Plagianakos and I. Maglogiannis},
  keywords = {Streaming motion data},
  keywords = {Fall detection},
  keywords = {Visual data},
  keywords = {Cumulative sum (CUSUM) algorithm },
  abstract = {The analysis of human motion data is interesting in the context of activity recognition or emergency event detection, especially in the case of elderly or disabled people living independently in their homes. Several techniques have been proposed for identifying such distress situations using either motion, audio and video sensors on the monitored subject (wearable sensors) or devices installed in the surrounding environment. Visual data captured from the user's environment, using overhead cameras along with motion data, which are collected from accelerometers on the subject's body, can be fed to activity detection systems that can detect emergency situations like falls and injuries. The output of these sensors consists of data streams that require real-time recognition, especially in such emergency situations. In this paper, we study motion and activity related streaming data and we propose classification schemes using traditional classification approaches. However, such approaches may not always be applicable for immediate alarm triggering and fall prevention or when \{CPU\} power and memory resources are limited (e.g. running the detection algorithm on a mobile device such as a smartphone). To this end, we also propose a statistical mining methodology that may be used for real-time motion data processing. The paper includes details of the stream data analysis methodology incorporated in the activity recognition and fall detection system along with an initial evaluation of the achieved accuracy in detecting falls. The results are promising and indicate that real-time fall detection is feasible using the proposed methodology. }
}
@article{Chiang2005311,
  title = {Linear correlation discovery in databases: a data mining approach },
  journal = {Data & Knowledge Engineering },
  volume = {53},
  number = {3},
  pages = {311 - 337},
  year = {2005},
  note = {},
  issn = {0169-023X},
  doi = {http://dx.doi.org/10.1016/j.datak.2004.09.002},
  url = {http://www.sciencedirect.com/science/article/pii/S0169023X04001521},
  author = {Roger H.L. Chiang and Chua Eng Huang Cecil and Ee-Peng Lim},
  keywords = {Knowledge discovery in database},
  keywords = {Linear correlation},
  keywords = {Association measurement},
  keywords = {Data mining },
  abstract = {Very little research in knowledge discovery has studied how to incorporate statistical methods to automate linear correlation discovery (LCD). We present an automatic \{LCD\} methodology that adopts statistical measurement functions to discover correlations from databases’ attributes. Our methodology automatically pairs attribute groups having potential linear correlations, measures the linear correlation of each pair of attribute groups, and confirms the discovered correlation. The methodology is evaluated in two sets of experiments. The results demonstrate the methodology’s ability to facilitate linear correlation discovery for databases with a large amount of data. }
}
@article{Yang2004645,
  title = {A text mining approach on automatic generation of web directories and hierarchies },
  journal = {Expert Systems with Applications },
  volume = {27},
  number = {4},
  pages = {645 - 663},
  year = {2004},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2004.06.009},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417404000715},
  author = {Hsin-Chang Yang and Chung-Hong Lee},
  keywords = {World wide web},
  keywords = {Web hierarchy construction},
  keywords = {Web directory construction},
  keywords = {Text mining},
  keywords = {Self-organizing map },
  abstract = {The World Wide Web (WWW) has been recognized as the ultimate and unique source of information for the information retrieval and knowledge discovery communities. A tremendous amount of knowledge is recorded using various types of media, producing an enormous number of web pages in the WWW. Retrieval of required information from the \{WWW\} is thus an arduous task. Different schemes for retrieving web pages have been used by the \{WWW\} community. One of the most widely used schemes is to traverse predefined web directories to reach a user's goal. These web directories are compiled or classified folders of web pages and are usually organized into hierarchical structures. The classification of web pages into proper directories and the organization of directory hierarchies are generally performed by human experts. In this work, we provide a corpus-based method that applies text mining techniques to a corpus of web pages to automatically create web directories and organize them into hierarchies. The method is based on the self-organizing map learning algorithm and requires no human intervention during the construction of web directories and hierarchies. The experiments show that our method can produce comprehensible and reasonable web directories and hierarchies. }
}
@article{vanderAalst2003237,
  title = {Workflow mining: A survey of issues and approaches },
  journal = {Data & Knowledge Engineering },
  volume = {47},
  number = {2},
  pages = {237 - 267},
  year = {2003},
  note = {},
  issn = {0169-023X},
  doi = {http://dx.doi.org/10.1016/S0169-023X(03)00066-1},
  url = {http://www.sciencedirect.com/science/article/pii/S0169023X03000661},
  author = {W.M.P. van der Aalst and B.F. van Dongen and J. Herbst and L. Maruster and G. Schimm and A.J.M.M. Weijters},
  keywords = {Workflow mining},
  keywords = {Workflow management},
  keywords = {Data mining},
  keywords = {Petri nets },
  abstract = {Many of today’s information systems are driven by explicit process models. Workflow management systems, but also ERP, CRM, SCM, and B2B systems, are configured on the basis of a workflow model specifying the order in which tasks need to be executed. Creating a workflow design is a complicated, time-consuming process, and typically there are discrepancies between the actual workflow processes and the processes as perceived by the management. To support the design of workflows, we propose the use of workflow mining. The starting point for workflow mining is a so-called “workflow log” containing information about the workflow process as it is actually being executed. In this paper, we introduce the concept of workflow mining and present a common format for workflow logs. Then we discuss the most challenging problems and present some of the workflow mining approaches available today. }
}
@article{Chau2004219,
  title = {A multilingual text mining approach to web cross-lingual text retrieval },
  journal = {Knowledge-Based Systems },
  volume = {17},
  number = {5–6},
  pages = {219 - 227},
  year = {2004},
  note = {Special Issue: Web Intelligence },
  issn = {0950-7051},
  doi = {http://dx.doi.org/10.1016/j.knosys.2004.04.001},
  url = {http://www.sciencedirect.com/science/article/pii/S0950705104000309},
  author = {Rowena Chau and Chung-Hsing Yeh},
  keywords = {Multilingual text mining},
  keywords = {Cross-lingual text retrieval},
  keywords = {Agent},
  keywords = {Fuzzy clustering},
  keywords = {Fuzzy classification },
  abstract = {To enable concept-based cross-lingual text retrieval (CLTR) using multilingual text mining, our approach will first discover the multilingual concept–term relationships from linguistically diverse textual data relevant to a domain. Second, the multilingual concept–term relationships, in turn, are used to discover the conceptual content of the multilingual text, which is either a document containing potentially relevant information or a query expressing an information need. When language-independent concepts hidden beneath both document and query are revealed, concept-based matching is made possible. Hence, concept-based \{CLTR\} is facilitated. This approach is employed for developing a multi-agent system to facilitate concept-based \{CLTR\} on the Web. }
}
@article{Cambria201341,
  title = {Application of multi-dimensional scaling and artificial neural networks for biologically inspired opinion mining },
  journal = {Biologically Inspired Cognitive Architectures },
  volume = {4},
  number = {0},
  pages = {41 - 53},
  year = {2013},
  note = {},
  issn = {2212-683X},
  doi = {http://dx.doi.org/10.1016/j.bica.2013.02.003},
  url = {http://www.sciencedirect.com/science/article/pii/S2212683X13000212},
  author = {Erik Cambria and Thomas Mazzocco and Amir Hussain},
  keywords = {\{AI\}},
  keywords = {\{NLP\}},
  keywords = {\{ANN\}},
  keywords = {Cognitive modelling},
  keywords = {Sentic computing },
  abstract = {The way people express their opinions has radically changed in the past few years thanks to the advent of online collaborative media. The distillation of knowledge from this huge amount of unstructured information can be a key factor for marketers who want to create an identity for their product or brand in the minds of their customers. These online social data, however, remain hardly accessible to computers, as they are specifically meant for human consumption. Existing approaches to opinion mining, in fact, are still far from being able to infer the cognitive and affective information associated with natural language as they mainly rely on knowledge bases that are too limited to efficiently process text at concept-level. In this context, standard clustering techniques have been previously employed on an affective common-sense knowledge base in an attempt to discover how different natural language concepts are semantically and affectively related to each other and, hence, to accordingly mine on-line opinions. In this work, a novel cognitive model based on the combined use of multi-dimensional scaling and artificial neural networks is exploited for better modelling the way multi-word expressions are organised in a brain-like universe of natural language concepts. The integration of a biologically inspired paradigm with standard principal component analysis helps to better grasp the non-linearities of the resulting vector space and, hence, improve the affective common-sense reasoning capabilities of the system. }
}
@article{Nanopoulos2004365,
  title = {Memory-adaptive association rules mining },
  journal = {Information Systems },
  volume = {29},
  number = {5},
  pages = {365 - 384},
  year = {2004},
  note = {},
  issn = {0306-4379},
  doi = {http://dx.doi.org/10.1016/S0306-4379(03)00035-8},
  url = {http://www.sciencedirect.com/science/article/pii/S0306437903000358},
  author = {Alexandros Nanopoulos and Yannis Manolopoulos},
  keywords = {Association rules},
  keywords = {Memory fluctuations},
  keywords = {Resource management},
  keywords = {Data mining },
  abstract = {New application areas have resulted in an increase in the diversity of the workloads that database management systems have to confront. Resource management for mixed workloads is attained with the prioritization of their tasks, which during their execution may be forced to release some of their resources. In this paper, we consider workloads that consist of mixtures of \{OLTP\} transactions and association rule mining queries. We propose and evaluate a new scheme for memory-adaptive association rule mining. It is designed to be used in the case of memory fluctuations, which are due to \{OLTP\} transactions that run with higher priority. The proposed scheme uses dynamic adjustment to the provided buffer space. Thus, it avoids the drawbacks of simple but naive approaches, namely thrashing due to large disk accesses that can be caused by the direct use of virtual memory, or long delay times due to suspension. Detailed experimental results, which consider a wide range of factors, indicate the superiority of the proposed scheme. }
}
@article{Liu201390,
  title = {Discovering hot topics from geo-tagged video },
  journal = {Neurocomputing },
  volume = {105},
  number = {0},
  pages = {90 - 99},
  year = {2013},
  note = {Learning for Scalable Multimedia Representation },
  issn = {0925-2312},
  doi = {http://dx.doi.org/10.1016/j.neucom.2012.05.035},
  url = {http://www.sciencedirect.com/science/article/pii/S092523121200817X},
  author = {Kuien Liu and Jiajie Xu and Longfei Zhang and Zhiming Ding and Mingshu Li},
  keywords = {Geo-tagged video},
  keywords = {Hot topic discovery},
  keywords = {Out-of-focus phenomenon },
  abstract = {As video data generated by users booms continuously, making sense of large-scale data archives is considered a critical challenge for data management. Most existing learning techniques that extract signal-level contents from video data struggle to scale due to efficiency limits. With the development of pervasive positioning techniques, discovering hot topics from multimedia data by their geographical tags has become practical: videos taken by advanced cameras are associated with \{GPS\} locations, and geo-tagged videos from YouTube can be identified by their associated \{GPS\} locations on Google Maps. This enables us to learn about the cultures, scenes, and human behaviors in videos based on their spatio-temporal distributions. However, meaningful topic discovery requires an efficient clustering approach, through which coherent topics can be detected according to particular geographical regions without out-of-focus effects. To handle this problem, this paper presents a filter-refinement framework to discover hot topics corresponding to geographically dense regions, and then introduces two novel metrics to refine unbounded hot regions, together with a heuristic method for setting rational thresholds on these metrics. The results of extensive experiments prove that hot topics can be efficiently discovered by our framework, and more compact topics can be achieved after using the novel metrics. }
}
@article{Hu200469,
  title = {Deriving two-stage learning sequences from knowledge in fuzzy sequential pattern mining },
  journal = {Information Sciences },
  volume = {159},
  number = {1–2},
  pages = {69 - 86},
  year = {2004},
  note = {},
  issn = {0020-0255},
  doi = {http://dx.doi.org/10.1016/S0020-0255(03)00190-7},
  url = {http://www.sciencedirect.com/science/article/pii/S0020025503001907},
  author = {Yi-Chung Hu and Gwo-Hshiung Tzeng and Chin-Mi Chen},
  keywords = {Competence sets},
  keywords = {Data mining},
  keywords = {Sequential patterns},
  keywords = {Fuzzy sets },
  abstract = {A fuzzy sequential pattern consisting of several fuzzy sets represents a frequently occurring behavior related to time and can be discovered from transaction bases. For example, customers bought large amounts of one product after having bought small amounts of another product. Recently, Hu et al. (2003) proposed a fuzzy data mining method to discover fuzzy sequential patterns. In this method, consumers’ product preferences and product buying orders related to purchase behaviors can be found by fuzzy sequential pattern mining. Since for each decision problem there is a competence set consisting of ideas, knowledge, information, and skills for solving that problem, we consider knowledge found in fuzzy sequential pattern mining as a needed competence set for solving one decision problem. This paper uses a known competence set expansion method, the minimum spanning table method, to find appropriate two-stage learning sequences that can effectively acquire the individual fuzzy knowledge sets found in fuzzy sequential pattern mining. A numerical example is used to show the usefulness of the proposed method. }
}
@article{Munková2013775,
  title = {Impact of Cognitive Style “Category Width” on the use of Social and Expressive Factors in Politeness Speech Acts: Text Mining Application },
  journal = {Procedia - Social and Behavioral Sciences },
  volume = {82},
  number = {0},
  pages = {775 - 779},
  year = {2013},
  note = {World Conference on Psychology and Sociology 2012 },
  issn = {1877-0428},
  doi = {http://dx.doi.org/10.1016/j.sbspro.2013.06.347},
  url = {http://www.sciencedirect.com/science/article/pii/S1877042813014146},
  author = {Daša Munková and Eva Stranovská and Michal Munk and Beáta Ďuračková},
  keywords = {‘Category width’ Cognitive Style},
  keywords = {Factors of Politeness in Speech Acts of Requests},
  keywords = {Mother Tongue (MT)},
  keywords = {Foreign Language (FL) },
  abstract = {Nowadays, spontaneity, success and coherence of intercultural communication (especially politeness communication) are studied from the point of view of different linguistic theories. However, not enough attention is paid to the cognitive characteristics of the interlocutor. These characteristics, as well as the context and social specifics of communication, influence communication behaviour in foreign language utterances. The aim of our paper is to examine the relationship between the cognitive style ‘category width’ and social and expressive factors in politeness speech acts formulated as requests in the mother tongue (Slovak) and foreign languages (English, Spanish and German). }
}
@article{Lagus2004135,
  title = {Mining massive document collections by the \{WEBSOM\} method },
  journal = {Information Sciences },
  volume = {163},
  number = {1–3},
  pages = {135 - 156},
  year = {2004},
  note = {Soft Computing Data Mining },
  issn = {0020-0255},
  doi = {http://dx.doi.org/10.1016/j.ins.2003.03.017},
  url = {http://www.sciencedirect.com/science/article/pii/S0020025503004195},
  author = {Krista Lagus and Samuel Kaski and Teuvo Kohonen},
  keywords = {Information retrieval},
  keywords = {Self-organizing map (SOM)},
  keywords = {Text mining},
  keywords = {\{WEBSOM\} },
  abstract = {A viable alternative to the traditional text-mining methods is the WEBSOM, a software system based on the Self-Organizing Map (SOM) principle. Prior to the searching or browsing operations, this method orders a collection of textual items, say, documents according to their contents, and maps them onto a regular two-dimensional array of map units. Documents that are similar on the basis of their whole contents will be mapped to the same or neighboring map units, and at each unit there exist links to the document database. Thus, while the searching can be started by locating those documents that match best with the search expression, further relevant search results can be found on the basis of the pointers stored at the same or neighboring map units, even if they did not match the search criterion exactly. This work contains an overview of the \{WEBSOM\} method and its performance, and as a special application, the \{WEBSOM\} map of the texts of Encyclopaedia Britannica is described. }
}
@article{Milne2013222,
  title = {An open-source toolkit for mining Wikipedia },
  journal = {Artificial Intelligence },
  volume = {194},
  number = {0},
  pages = {222 - 239},
  year = {2013},
  note = {Artificial Intelligence, Wikipedia and Semi-Structured Resources },
  issn = {0004-3702},
  doi = {http://dx.doi.org/10.1016/j.artint.2012.06.007},
  url = {http://www.sciencedirect.com/science/article/pii/S000437021200077X},
  author = {David Milne and Ian H. Witten},
  keywords = {Wikipedia},
  keywords = {Toolkit},
  keywords = {Ontology extraction},
  keywords = {Semantic relatedness},
  keywords = {Disambiguation},
  keywords = {Annotation },
  abstract = {The online encyclopedia Wikipedia is a vast, constantly evolving tapestry of interlinked articles. For developers and researchers it represents a giant multilingual database of concepts and semantic relations, a potential resource for natural language processing and many other research areas. This paper introduces the Wikipedia Miner toolkit, an open-source software system that allows researchers and developers to integrate Wikipedia's rich semantics into their own applications. The toolkit creates databases that contain summarized versions of Wikipedia's content and structure, and includes a Java \{API\} to provide access to them. Wikipedia's articles, categories and redirects are represented as classes, and can be efficiently searched, browsed, and iterated over. Advanced features include parallelized processing of Wikipedia dumps, machine-learned semantic relatedness measures and annotation features, and XML-based web services. Wikipedia Miner is intended to be a platform for sharing data mining techniques. }
}
@article{Chen2014495,
  title = {We work with them? Healthcare workers interpretation of organizational relations mined from electronic health records },
  journal = {International Journal of Medical Informatics },
  volume = {83},
  number = {7},
  pages = {495 - 506},
  year = {2014},
  note = {},
  issn = {1386-5056},
  doi = {http://dx.doi.org/10.1016/j.ijmedinf.2014.04.006},
  url = {http://www.sciencedirect.com/science/article/pii/S1386505614000665},
  author = {You Chen and Nancy Lorenzi and Steve Nyemba and Jonathan S. Schildcrout and Bradley Malin},
  keywords = {Electronic health records},
  keywords = {Organizational modeling},
  keywords = {Data mining},
  keywords = {Survey },
  abstract = {Objective: Models of healthcare organizations (HCOs) are often defined up front by a select few administrative officials and managers. However, given the size and complexity of modern healthcare systems, this practice does not scale easily. The goal of this work is to investigate the extent to which organizational relationships can be automatically learned from utilization patterns of electronic health record (EHR) systems. Method: We designed an online survey to solicit the perspectives of employees of a large academic medical center. We surveyed employees from two administrative areas: (1) Coding & Charge Entry and (2) Medical Information Services, and two clinical areas: (3) Anesthesiology and (4) Psychiatry. To test our hypotheses we selected two administrative units that have work-related responsibilities with electronic records; however, for the clinical areas we selected two disciplines with very different patient responsibilities but whose accesses, and the people who made them, were similar. We provided each group of employees with questions regarding the chance of interaction between areas in the medical center in the form of association rules (e.g., given someone from Coding & Charge Entry accessed a patient's record, what is the chance that someone from Medical Information Services accesses the same record?). We compared the respondent predictions with the rules learned from actual \{EHR\} utilization using linear mixed-effects regression models. Results: The findings from our survey confirm that medical center employees can distinguish between association rules of high and non-high likelihood when their own area is involved. Moreover, they can make such distinctions for any \{HCO\} area in this survey. It was further observed that, with respect to highly likely interactions, respondents from certain areas were significantly better than other respondents at making such distinctions, and certain areas’ associations were more distinguishable than others. Conclusions: These results illustrate that \{EHR\} utilization patterns may be consistent with the expectations of \{HCO\} employees. Our findings show that certain areas in the \{HCO\} are easier than others for employees to assess, which suggests that automated learning strategies may yield more accurate models of healthcare organizations than those based on the perspectives of a select few individuals. }
}
@article{Zhang2013289,
  title = {Concept extraction and e-commerce applications },
  journal = {Electronic Commerce Research and Applications },
  volume = {12},
  number = {4},
  pages = {289 - 296},
  year = {2013},
  note = {Social Commerce- Part 2 },
  issn = {1567-4223},
  doi = {http://dx.doi.org/10.1016/j.elerap.2013.03.008},
  url = {http://www.sciencedirect.com/science/article/pii/S1567422313000227},
  author = {Yongzheng Zhang and Rajyashree Mukherjee and Benny Soetarman},
  keywords = {Concept extraction},
  keywords = {Automatic keyphrase extraction},
  keywords = {e-Commerce},
  keywords = {Product matching},
  keywords = {Topic-based opinion mining },
  abstract = {Concept extraction is the technique of mining the most important topic of a document. In the e-commerce context, concept extraction can be used to identify what a shopping related Web page is talking about. This is practically useful in applications like search relevance and product matching. In this paper, we investigate two concept extraction methods: Automatic Concept Extractor (ACE) and Automatic Keyphrase Extraction (KEA). \{ACE\} is an unsupervised method that looks at both text and \{HTML\} tags. We upgrade \{ACE\} into Improved Concept Extractor (ICE) with significant improvements. \{KEA\} is a supervised learning system. We evaluate the methods by comparing automatically generated concepts to a gold standard. The experimental results demonstrate that \{ICE\} significantly outperforms \{ACE\} and also outperforms \{KEA\} in concept extraction. To demonstrate the practical use of concept extraction in the e-commerce context, we use \{ICE\} and \{KEA\} to showcase two e-commerce applications, i.e. product matching and topic-based opinion mining. }
}
@article{Guray2003261,
  title = {Ore-age: a hybrid system for assisting and teaching mining method selection },
  journal = {Expert Systems with Applications },
  volume = {24},
  number = {3},
  pages = {261 - 271},
  year = {2003},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/S0957-4174(02)00154-9},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417402001549},
  author = {Cenk Guray and Nes'e Celebi and Volkan Atalay and A.Gunhan Pasamehmetoglu},
  keywords = {Software agents},
  keywords = {Mining method selection},
  keywords = {Fuzzy logic},
  keywords = {Neuro-fuzzy systems},
  keywords = {Tutoring },
  abstract = {Mining method selection is among the most critical and problematic issues in the mining engineering profession. Choosing a suitable method for a given ore-body is very important for the economics, safety and productivity of the mining work. Past studies have attempted to build a systematic approach to help engineers make this selection, but these approaches work on static databases and fail to incorporate the intuitive feelings and engineering judgments of experienced engineers into the selection process. In this study, a system based on 13 different expert systems and one interface agent is developed to perform mining method selection for given ore-bodies. The agent, Ore-Age, pursuing its goal of providing the maximum assistance to engineers in selecting the most suitable method for a specific ore-body, tries to learn from the experiences of the experts it has encountered. After this learning process the knowledge base evolves to include these experiences, making the system more efficient and intuitive in the mining method selection task. To realize the above goal, the agent executes the system's tutoring procedure whenever an inexperienced user enters the system, to fill in his/her missing knowledge about mining method selection. }
}
@article{Gieger200397,
  title = {The future of text mining in genome-based clinical research },
  journal = {\{BIOSILICO\} },
  volume = {1},
  number = {3},
  pages = {97 - 102},
  year = {2003},
  note = {},
  issn = {1478-5382},
  doi = {http://dx.doi.org/10.1016/S1478-5382(03)02336-9},
  url = {http://www.sciencedirect.com/science/article/pii/S1478538203023369},
  author = {Christian Gieger and Hartwig Deneke and Juliane Fluck},
  keywords = {Text mining},
  keywords = {information extraction},
  keywords = {information retrieval},
  keywords = {ontology},
  keywords = {molecular biology},
  keywords = {clinical research },
  abstract = {Efficient information retrieval and extraction is a major challenge in molecular biology and genome-based clinical research. In addition, there is an increasing demand to combine information from different resources and across different disciplines in life sciences. Unfortunately, a large proportion of this information is only available in scientific articles. Moreover, the volume of literature is growing almost exponentially. Text mining provides methods to retrieve and extract information contained in free-text automatically. Here, we discuss the challenges and limitations of text mining in biology and medicine, including unsolved problems and necessary developments. }
}
@article{Chang201345,
  title = {A block mining and re-combination enhanced genetic algorithm for the permutation flowshop scheduling problem },
  journal = {International Journal of Production Economics },
  volume = {141},
  number = {1},
  pages = {45 - 55},
  year = {2013},
  note = {Meta-heuristics for manufacturing scheduling and logistics problems },
  issn = {0925-5273},
  doi = {http://dx.doi.org/10.1016/j.ijpe.2012.06.007},
  url = {http://www.sciencedirect.com/science/article/pii/S0925527312002356},
  author = {Pei-Chann Chang and Wei-Hsiu Huang and Jheng-Long Wu and T.C.E. Cheng},
  keywords = {Building blocks},
  keywords = {Genetic algorithm},
  keywords = {Artificial chromosomes},
  keywords = {Linkage learning},
  keywords = {Re-combination},
  keywords = {Flowshop scheduling },
  abstract = {The goal of block mining is to obtain a set of genes that contain dependency among gene relationships. Such blocks without overlapping of genes can be further merged to form a new chromosome and the quality of the new chromosome can be greatly improved. Based on this concept, we propose a novel block mining method that is able to locate common structures or to establish new blocks (like a small piece of a puzzle) from a set of high-fit chromosomes. The identified blocks (puzzles) will also be updated generation by generation through the newly updated high-fit chromosomes. We develop a heuristic re-combination procedure to form a new chromosome by re-combining the blocks. We call the newly generated chromosomes artificial chromosomes (ACs) and inject them into the evolutionary process when the convergence slope of the evolutionary process is less than a predefined threshold. This new algorithm retains the regular simple genetic algorithm (SGA) operations of crossover and mutation, and utilizes the \{ACs\} generated from elites to speed up the convergence process. Experimental results indicate that the puzzle-based method of chromosome generation is very efficient and effective in solving the traditional permutation flowshop scheduling problem. The new method can be applied to tackle other NP-complete problems such as scheduling and vehicle routing problems. }
}
@incollection{Han2012585,
  title = {13 - Data Mining Trends and Research Frontiers },
  editor = {Han, Jiawei and Kamber, Micheline and Pei, Jian },
  booktitle = {Data Mining (Third Edition) },
  publisher = {Morgan Kaufmann},
  edition = {Third Edition},
  address = {Boston},
  year = {2012},
  pages = {585 - 631},
  series = {The Morgan Kaufmann Series in Data Management Systems},
  isbn = {978-0-12-381479-1},
  doi = {http://dx.doi.org/10.1016/B978-0-12-381479-1.00013-7},
  url = {http://www.sciencedirect.com/science/article/pii/B9780123814791000137},
  author = {Jiawei Han and Micheline Kamber and Jian Pei},
  abstract = {This chapter presents a high-level overview of mining complex data types, which includes mining sequence data such as time series, symbolic sequences, and biological sequences; mining graphs and networks; and mining other kinds of data, including spatiotemporal and cyber-physical system data, multimedia, text and Web data, and data streams. The chapter focuses on trends and research frontiers in data mining and presents an overview of methodologies for mining complex data types. Such mining includes mining time-series, sequential patterns, and biological sequences; graphs and networks; spatiotemporal data, including geospatial data, moving-object data, and cyber-physical system data; multimedia data; text data; web data; and data streams. Other approaches to data mining, including statistical methods, theoretical foundations, and visual and audio data mining, are briefly introduced. Several well-established statistical methods have been proposed for data analysis, such as regression, generalized linear models, analysis of variance, mixed-effect models, factor analysis, discriminant analysis, survival analysis, and quality control. Data mining applications in business and in science, including the financial, retail, and telecommunication industries, science and engineering, and recommender systems, are introduced. The social impacts of data mining are discussed, including ubiquitous and invisible data mining, and privacy-preserving data mining. Finally, current and expected data mining trends that arise in response to new challenges in the field are speculated upon. }
}
@article{Cuzzocrea2013309,
  title = {Theoretical and practical aspects of warehousing, querying and mining sensor and streaming data },
  journal = {Journal of Computer and System Sciences },
  volume = {79},
  number = {3},
  pages = {309 - 311},
  year = {2013},
  note = {Theoretical and Practical Aspects of Warehousing, Querying and Mining Sensor and Streaming Data },
  issn = {0022-0000},
  doi = {http://dx.doi.org/10.1016/j.jcss.2012.09.008},
  url = {http://www.sciencedirect.com/science/article/pii/S0022000012001444},
  author = {Alfredo Cuzzocrea}
}
@article{Albar20021751,
  title = {A bucket wheel dredge system for offshore tin mining beyond the 50 m water depth },
  journal = {Ocean Engineering },
  volume = {29},
  number = {14},
  pages = {1751 - 1767},
  year = {2002},
  note = {},
  issn = {0029-8018},
  doi = {http://dx.doi.org/10.1016/S0029-8018(02)00003-3},
  url = {http://www.sciencedirect.com/science/article/pii/S0029801802000033},
  author = {A Albar and R.E Randall and B Dwibarto and B.L Edge},
  keywords = {Tin mining},
  keywords = {Tin-ore physical properties},
  keywords = {Bucket wheel dredge},
  keywords = {Hydraulic transport},
  keywords = {Dredge simulation },
  abstract = {A bucket wheel dredge (BWD) for offshore tin mining is part of the long term plan of \{PT\} Timah Tbk to identify new dredging technologies for mining in greater than 50 m water depths. Measured tin-ore physical properties are used in a new \{BWD\} computer model to investigate deep water tin mining. The model simulates the cutting and hydraulic transport of submerged tin-ore. The results show that hydraulic tin mining beyond water depths of 50 m is feasible. It is best to employ hydraulic transport for lifting the tin-ore from the ocean floor to the surface and to use barges to transport the tin-ore to land processing plants. }
}
@article{Ampatzoglou2013131,
  title = {Building and mining a repository of design pattern instances: Practical and research benefits },
  journal = {Entertainment Computing },
  volume = {4},
  number = {2},
  pages = {131 - 142},
  year = {2013},
  note = {},
  issn = {1875-9521},
  doi = {http://dx.doi.org/10.1016/j.entcom.2012.10.002},
  url = {http://www.sciencedirect.com/science/article/pii/S1875952112000195},
  author = {Apostolos Ampatzoglou and Olia Michou and Ioannis Stamelos},
  keywords = {Software engineering},
  keywords = {Computer games},
  keywords = {Design patterns},
  keywords = {Repository },
  abstract = {Design patterns are well-known design solutions that are reported to produce substantial benefits with respect to software quality. However, to our knowledge there are no scientific efforts on gathering information on software projects that use design patterns. This paper introduces a web repository of design patterns instances that have been used in open source projects. The usefulness of such a repository lies in the provision of a base of knowledge, where developers can identify reusable components and researchers can find a mined data set. Currently, 141 open source projects have been considered and more than 4500 pattern instances have been found and recorded in the database of the repository. The evaluation of the repository has been performed from an academic and a practical point of view. The results suggest that the repository can be useful for both experienced and inexperienced users. However, the benefits of using the repository are more significant for inexperienced users. }
}
@article{Feldman200369,
  title = {Mining the biomedical literature using semantic analysis and natural language processing techniques },
  journal = {\{BIOSILICO\} },
  volume = {1},
  number = {2},
  pages = {69 - 80},
  year = {2003},
  note = {},
  issn = {1478-5382},
  doi = {http://dx.doi.org/10.1016/S1478-5382(03)02330-8},
  url = {http://www.sciencedirect.com/science/article/pii/S1478538203023308},
  author = {Ronen Feldman and Yizhar Regev and Eyal Hurvitz and Michal Finkelstein-Landau},
  keywords = {text mining},
  keywords = {biomedical literature databases},
  keywords = {information extraction},
  keywords = {Lexicon-based entities},
  keywords = {Machine-assisted indexing },
  abstract = {The information age has made the electronic storage of large amounts of data effortless. The proliferation of documents available on the Internet, corporate intranets, news wires and elsewhere is overwhelming. Search engines only exacerbate this overload problem by making increasingly more documents available in only a few keystrokes. This information overload also exists in the biomedical field, where scientific publications and other forms of text-based data are produced at an unprecedented rate. Text mining is the combined, automated process of analyzing unstructured, natural language text to discover information and knowledge that are typically difficult to retrieve. Here, we focus on text mining as applied to the biomedical literature. We focus in particular on finding relationships among genes, proteins, drugs and diseases, to facilitate an understanding and prediction of complex biological processes. The LitMiner™ system, developed specifically for this purpose, is described in relation to the Knowledge Discovery and Data Mining Cup 2002, which serves as a formal evaluation of the system. }
}
@article{Rygielski2002483,
  title = {Data mining techniques for customer relationship management },
  journal = {Technology in Society },
  volume = {24},
  number = {4},
  pages = {483 - 502},
  year = {2002},
  note = {},
  issn = {0160-791X},
  doi = {http://dx.doi.org/10.1016/S0160-791X(02)00038-6},
  url = {http://www.sciencedirect.com/science/article/pii/S0160791X02000386},
  author = {Chris Rygielski and Jyun-Cheng Wang and David C. Yen},
  keywords = {Customer relationship management (CRM)},
  keywords = {Relationship marketing},
  keywords = {Data mining},
  keywords = {Neural networks},
  keywords = {Chi-square automated interaction detection (CHAID)},
  keywords = {Privacy rights },
  abstract = {Advancements in technology have made relationship marketing a reality in recent years. Technologies such as data warehousing, data mining, and campaign management software have made customer relationship management a new area where firms can gain a competitive advantage. Particularly through data mining—the extraction of hidden predictive information from large databases—organizations can identify valuable customers, predict future behaviors, and enable firms to make proactive, knowledge-driven decisions. The automated, future-oriented analyses made possible by data mining move beyond the analyses of past events typically provided by history-oriented tools such as decision support systems. Data mining tools answer business questions that in the past were too time-consuming to pursue. Yet, it is the answers to these questions that make customer relationship management possible. Various techniques exist among data mining software, each with its own advantages and challenges for different types of applications. A particular dichotomy exists between neural networks and chi-square automated interaction detection (CHAID). While differing approaches abound in the realm of data mining, the use of some type of data mining is necessary to accomplish the goals of today’s customer relationship management philosophy. }
}
@article{Heinrichs2003103,
  title = {Integrating web-based data mining tools with business models for knowledge management },
  journal = {Decision Support Systems },
  volume = {35},
  number = {1},
  pages = {103 - 112},
  year = {2003},
  note = {Web Retrieval and Mining },
  issn = {0167-9236},
  doi = {http://dx.doi.org/10.1016/S0167-9236(02)00098-2},
  url = {http://www.sciencedirect.com/science/article/pii/S0167923602000982},
  author = {John H. Heinrichs and Jeen-Su Lim},
  keywords = {Web-based data mining},
  keywords = {Knowledge management},
  keywords = {Business models},
  keywords = {Business intelligence},
  keywords = {Business insights},
  keywords = {Competitive advantage },
  abstract = {As firms begin to implement web-based presentation and data mining tools to enhance decision support capability, the firm's knowledge workers must determine how to most effectively use these new web-based tools to deliver competitive advantage. The focus of this study is on evaluating how knowledge workers integrate these tools into their information and knowledge management requirements. The relationship between the independent variables (web-based data mining software tools and business models) and the dependent variable (strategic performance capabilities) is empirically tested in this study. The results from this study demonstrate the positive interaction effect of applying the tools and the models together on strategic performance capability. }
}
@article{Hu2003709,
  title = {Elicitation of classification rules by fuzzy data mining },
  journal = {Engineering Applications of Artificial Intelligence },
  volume = {16},
  number = {7–8},
  pages = {709 - 716},
  year = {2003},
  note = {},
  issn = {0952-1976},
  doi = {http://dx.doi.org/10.1016/j.engappai.2003.09.007},
  url = {http://www.sciencedirect.com/science/article/pii/S0952197603001155},
  author = {Yi-Chung Hu and Gwo-Hshiung Tzeng},
  keywords = {Data mining},
  keywords = {Fuzzy sets},
  keywords = {Classification problems},
  keywords = {Association rules },
  abstract = {Data mining techniques can be used to find potentially useful patterns from data and to ease the knowledge acquisition bottleneck in building prototype rule-based systems. Based on the partition methods presented in simple-fuzzy-partition-based method (SFPBM) proposed by Hu et al. (Comput. Ind. Eng. 43(4) (2002) 735), the aim of this paper is to propose a new fuzzy data mining technique consisting of two phases to find fuzzy if–then rules for classification problems: one to find frequent fuzzy grids by using a pre-specified simple fuzzy partition method to divide each quantitative attribute, and the other to generate fuzzy classification rules from frequent fuzzy grids. To improve the classification performance of the proposed method, we specially incorporate adaptive rules proposed by Nozaki et al. (IEEE Trans. Fuzzy Syst. 4(3) (1996) 238) into our methods to adjust the confidence of each classification rule. For classification generalization ability, the simulation results from the iris data demonstrate that the proposed method may effectively derive fuzzy classification rules from training samples. }
}
@article{Xianghua2013186,
  title = {Multi-aspect sentiment analysis for Chinese online social reviews based on topic modeling and HowNet lexicon },
  journal = {Knowledge-Based Systems },
  volume = {37},
  number = {0},
  pages = {186 - 195},
  year = {2013},
  note = {},
  issn = {0950-7051},
  doi = {http://dx.doi.org/10.1016/j.knosys.2012.08.003},
  url = {http://www.sciencedirect.com/science/article/pii/S0950705112002158},
  author = {Fu Xianghua and Liu Guo and Guo Yanyan and Wang Zhiqiang},
  keywords = {Aspect detection},
  keywords = {Sentiment analysis},
  keywords = {Social reviews},
  keywords = {Topic modeling},
  keywords = {HowNet lexicon },
  abstract = {User-generated reviews on the Web reflect users’ sentiment about products, services and social events. Existing research mostly focuses on the sentiment classification of product and service reviews at the document level. Reviews of social events such as economic and political activities, which are called social reviews, have specific characteristics different from those of product and service reviews. In this paper, we propose an unsupervised approach to automatically discover the aspects discussed in Chinese social reviews and also the sentiments expressed in different aspects. The approach is called Multi-aspect Sentiment Analysis for Chinese Online Social Reviews (MSA-COSR). We first apply the Latent Dirichlet Allocation (LDA) model to discover multi-aspect global topics of social reviews, and then extract the local topic and associated sentiment based on a sliding window context over the review text. The aspect of the local topic is identified by a trained \{LDA\} model, and the polarity of the associated sentiment is classified by the HowNet lexicon. The experimental results show that MSA-COSR can not only obtain good topic partitioning results but also helps to improve sentiment analysis accuracy. It helps to simultaneously discover multi-aspect fine-grained topics and the associated sentiment. }
}
@article{Tseng201210082,
  title = {Mining term networks from text collections for crime investigation },
  journal = {Expert Systems with Applications },
  volume = {39},
  number = {11},
  pages = {10082 - 10090},
  year = {2012},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2012.02.052},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417412002965},
  author = {Yuen-Hsien Tseng and Zih-Ping Ho and Kai-Sheng Yang and Chun-Cheng Chen},
  keywords = {Co-occurrence analysis},
  keywords = {Term relations},
  keywords = {Visualization},
  keywords = {Network analysis},
  keywords = {Knowledge discovery },
  abstract = {An efficient term mining method to build a general term network is presented. The resulting term network can be used for entity relation visualization and exploration, which is useful in many text-mining applications such as crime exploration and investigation from vast piles of crime news or official criminal records. In the proposed method, terms from each document in a text collection are first identified. They are subjected to an analysis for pairwise association weights. The weights are then accumulated over all the documents to obtain final similarity for each term pair. Based on the resulting term similarity, a general term network for the collection is built with terms as nodes and non-zero similarities as links. In application, a list of predefined terms having similar attributes was selected to extract the desired sub-network from the general term network for entity relation visualization. This text analysis scenario based on the collective terms of the similar type or from the same topic enables evidence-based relation exploration. Some practical instances of crime exploration and investigation are demonstrated. Our application examples show that term relations, be it causality, subordination, coupling, or others, can be effectively revealed by our method and easily verified by the underlying text collection. This work contributes by presenting an integrated term-relationship mining and exploration approach and demonstrating the feasibility of the term network to the increasingly important application of crime exploration and investigation. }
}
@article{Scharl2004229,
  title = {Mining large samples of web-based corpora },
  journal = {Knowledge-Based Systems },
  volume = {17},
  number = {5–6},
  pages = {229 - 233},
  year = {2004},
  note = {Special Issue: Web Intelligence },
  issn = {0950-7051},
  doi = {http://dx.doi.org/10.1016/j.knosys.2004.04.003},
  url = {http://www.sciencedirect.com/science/article/pii/S0950705104000310},
  author = {Arno Scharl and Christian Bauer},
  keywords = {Web mining},
  keywords = {Content analysis},
  keywords = {Renewable Energy},
  keywords = {Online media },
  abstract = {This paper presents a method to automatically mirror, process, and compare large samples of text corpora from Web-based information systems. The wealth of textual information contained in publicly available Web sites is converted into aggregated representations through textual analysis. The application of word lists, keyword analysis, term clustering, and correspondence analyses to identify and represent semantic relationships, including their longitudinal patterns, is illustrated through a case study that investigates the global coverage of solar power technologies in international media. The resulting graphs, indicators and tables describe complex relationships and developments that are hard to capture in traditional ways. As such they facilitate investigations about the nature and dynamics of Web content. }
}
@article{Guralnik2004443,
  title = {Parallel tree-projection-based sequence mining algorithms },
  journal = {Parallel Computing },
  volume = {30},
  number = {4},
  pages = {443 - 472},
  year = {2004},
  note = {},
  issn = {0167-8191},
  doi = {http://dx.doi.org/10.1016/j.parco.2004.03.003},
  url = {http://www.sciencedirect.com/science/article/pii/S0167819104000456},
  author = {Valerie Guralnik and George Karypis},
  keywords = {Frequent sequential patterns},
  keywords = {Database projection algorithms},
  keywords = {Data mining },
  abstract = {Discovery of sequential patterns is becoming increasingly useful and essential in many scientific and commercial domains. Enormous sizes of available datasets and possibly large number of mined patterns demand efficient, scalable, and parallel algorithms. Even though a number of algorithms have been developed to efficiently parallelize frequent pattern discovery algorithms that are based on the candidate-generation-and-counting framework, the problem of parallelizing the more efficient projection-based algorithms has received relatively little attention and existing parallel formulations have been targeted only toward shared-memory architectures. The irregular and unstructured nature of the task-graph generated by these algorithms and the fact that these tasks operate on overlapping sub-databases makes it challenging to efficiently parallelize these algorithms on scalable distributed-memory parallel computing architectures. In this paper we present and study a variety of distributed-memory parallel algorithms for a tree-projection-based frequent sequence discovery algorithm that are able to minimize the various overheads associated with load imbalance, database overlap, and interprocessor communication. Our experimental evaluation on a 32-processor \{IBM\} \{SP\} shows that these algorithms are capable of achieving good speedups, substantially reducing the amount of work required to find sequential patterns in large databases. }
}
@article{Symeonidis2003589,
  title = {Intelligent policy recommendations on enterprise resource planning by the use of agent technology and data mining techniques },
  journal = {Expert Systems with Applications },
  volume = {25},
  number = {4},
  pages = {589 - 602},
  year = {2003},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/S0957-4174(03)00099-X},
  url = {http://www.sciencedirect.com/science/article/pii/S095741740300099X},
  author = {Andreas L. Symeonidis and Dionisis D. Kehagias and Pericles A. Mitkas},
  keywords = {Supply Chain Management},
  keywords = {Customer Relationship Management},
  keywords = {Data mining},
  keywords = {Multi-agent systems},
  keywords = {Agent training },
  abstract = {Enterprise Resource Planning systems tend to deploy Supply Chain Management and/or Customer Relationship Management techniques, in order to successfully fuse information to customers, suppliers, manufacturers and warehouses, and therefore minimize system-wide costs while satisfying service level requirements. Although efficient, these systems are neither versatile nor adaptive, since newly discovered customer trends cannot be easily integrated with existing knowledge. Advancing the way the above-mentioned techniques are applied to \{ERP\} systems, we have developed a multi-agent system that introduces adaptive intelligence as a powerful add-on for \{ERP\} software customization. The system can be thought of as a recommendation engine, which takes advantage of knowledge gained through the use of data mining techniques, and incorporates it into the resulting company selling policy. The intelligent agents of the system can be periodically retrained as new information is added to the ERP. In this paper, we present the architecture and development details of the system, and demonstrate its application on a real test case. }
}
@article{Wu2003401,
  title = {Data mining applied to material acquisition budget allocation for libraries: design and development },
  journal = {Expert Systems with Applications },
  volume = {25},
  number = {3},
  pages = {401 - 411},
  year = {2003},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/S0957-4174(03)00065-4},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417403000654},
  author = {C.-H. Wu},
  keywords = {Acquisition budget allocation},
  keywords = {Circulation},
  keywords = {Data mining },
  abstract = {Library management frequently faces the need to make the value of the acquired materials significant as far as the most beneficial use of the allocated acquisition budget is concerned. Knowledge in the circulation databases can be explored in depth to reflect this need. In this paper, a data mining based model (DMBA) is designed and developed to help allocate the library material acquisition budget by opening up the utilization of library materials that users have made use of. The developed model is based on the \{ID3\} algorithm, exploring explanatory knowledge via information theory and statistics and deriving appropriateness via utilization gain. The main output of the \{DMBA\} is the weights that serve as the basis of library material acquisition budget allocation for departments, via the combination of explored explanatory knowledge and appropriateness. The developed \{DMBA\} is supported by a practical application case. }
}
@article{Roussinov2003149,
  title = {Automatic discovery of similarity relationships through Web mining },
  journal = {Decision Support Systems },
  volume = {35},
  number = {1},
  pages = {149 - 166},
  year = {2003},
  note = {Web Retrieval and Mining },
  issn = {0167-9236},
  doi = {http://dx.doi.org/10.1016/S0167-9236(02)00102-1},
  url = {http://www.sciencedirect.com/science/article/pii/S0167923602001021},
  author = {Dmitri Roussinov and J. Leon Zhao},
  keywords = {Data mining},
  keywords = {Context sensitive similarity discovery},
  keywords = {Empirical study},
  keywords = {Group decision support systems},
  keywords = {Internet},
  keywords = {Machine learning},
  keywords = {Organizational concept space},
  keywords = {Text clustering},
  keywords = {Web mining },
  abstract = {This work demonstrates how the World Wide Web can be mined in a fully automated manner for discovering the semantic similarity relationships among the concepts surfaced during an electronic brainstorming session, and thus improving the accuracy of automated clustering of meeting messages. Our novel Context Sensitive Similarity Discovery (CSSD) method takes advantage of the meeting context when selecting a subset of Web pages for data mining, and then conducts regular concept co-occurrence analysis within that subset. Our results have implications for reducing information overload in applications of text technologies such as email filtering, document retrieval, text summarization, and knowledge management. }
}
@article{He2013117,
  title = {Mining diversity subgraph in multidisciplinary scientific collaboration networks: A meso perspective },
  journal = {Journal of Informetrics },
  volume = {7},
  number = {1},
  pages = {117 - 128},
  year = {2013},
  note = {},
  issn = {1751-1577},
  doi = {http://dx.doi.org/10.1016/j.joi.2012.09.005},
  url = {http://www.sciencedirect.com/science/article/pii/S1751157712000806},
  author = {Bing He and Ying Ding and Jie Tang and Vignesh Reguramalingam and Johan Bollen},
  keywords = {Scientific collaboration},
  keywords = {Network analysis},
  keywords = {Subgraph detection },
  abstract = {This paper proposes a framework to analyze the interdisciplinary collaboration in a coauthorship network from a meso perspective using topic modeling: (1) a customized topic model is developed to capture and formalize the interdisciplinary feature; and (2) the two algorithms Diversity Subgraph Extraction (DSE) and Constraint-based Diversity Subgraph Extraction (CDSE) are designed and implemented to extract a meso view, i.e. a diversity subgraph of the interdisciplinary collaboration. The proposed framework is demonstrated using a coauthorship network in the field of computer science. A comparison between \{DSE\} and Breadth First Search (BSF)-based subgraph extraction favors \{DSE\} in capturing the diversity in interdisciplinary collaboration. Potential possibilities for studying various research topics based on the proposed framework of analysis are discussed. }
}
@article{Singh2013212,
  title = {Computational Exploration of Theme-based Blog Data Using Topic Modeling, \{NERC\} and Sentiment Classifier Combine },
  journal = {\{AASRI\} Procedia },
  volume = {4},
  number = {0},
  pages = {212 - 222},
  year = {2013},
  note = {2013 \{AASRI\} Conference on Intelligent Systems and Control },
  issn = {2212-6716},
  doi = {http://dx.doi.org/10.1016/j.aasri.2013.10.033},
  url = {http://www.sciencedirect.com/science/article/pii/S2212671613000346},
  author = {V.K. Singh and P. Waila and R. Piryani and A. Uddin},
  keywords = {Social Media},
  keywords = {Text Analytics},
  keywords = {Topic Modeling},
  keywords = {Named Entity Recognition},
  keywords = {Sentiment Classification },
  abstract = {This paper presents findings of our exploratory research work on a novel combine of Topic Modeling, Named Entity Recognition and Sentiment Classification for sociological analysis of blog data. We have collected more than 500 blog posts on the broader theme of ‘Discrimination, Abuse and Crime against Women’. We employed topic discovery to identify top keywords and key themes and implemented the 7-entity model Named Entity Recognition process to identify the key persons, organizations and locations discussed in the blog posts. Thereafter we performed sentiment classification of the entire blog data into positive and negative categories using SentiWordNet. The results obtained are very interesting and validate the usefulness of our approach for computational analysis of social media data. The key contribution of the paper is to propose a novel Text Analytics combine and demonstrate its applicability for computational exploration of the social media data for sociological analysis purposes. }
}
@incollection{Han2012279,
  title = {7 - Advanced Pattern Mining },
  editor = {Han, Jiawei and Kamber, Micheline and Pei, Jian },
  booktitle = {Data Mining (Third Edition) },
  publisher = {Morgan Kaufmann},
  edition = {Third Edition},
  address = {Boston},
  year = {2012},
  pages = {279 - 325},
  series = {The Morgan Kaufmann Series in Data Management Systems},
  isbn = {978-0-12-381479-1},
  doi = {http://dx.doi.org/10.1016/B978-0-12-381479-1.00007-1},
  url = {http://www.sciencedirect.com/science/article/pii/B9780123814791000071},
  author = {Jiawei Han and Micheline Kamber and Jian Pei},
  abstract = {This chapter discusses the advanced methods of frequent pattern mining, which mines more complex forms of frequent patterns and considers user preferences or constraints to speed up the mining process. Frequent pattern mining has reached far beyond the basics due to substantial research, numerous extensions of the problem scope, and broad application studies. An in-depth coverage of methods for mining many kinds of patterns is included elaborating on: multilevel patterns, multidimensional patterns, patterns in continuous data, rare patterns, negative patterns, constrained frequent patterns, frequent patterns in high-dimensional data, colossal patterns, and compressed and approximate patterns. Other pattern mining themes, including mining sequential and structured patterns and mining patterns from spatiotemporal, multimedia, and stream data, are considered more advanced. Pattern mining is a more general term than frequent pattern mining since the former covers rare and negative patterns as well. However, when there is no ambiguity, the two terms are used interchangeably. In addition to mining for basic frequent itemsets and associations, advanced forms of patterns can be mined such as multilevel associations and multidimensional associations, quantitative association rules, rare patterns, and negative patterns. Users can also mine high-dimensional patterns and compressed or approximate patterns. Frequent pattern mining has many diverse applications, ranging from pattern-based data cleaning to pattern-based classification, clustering, and outlier or exception analysis. }
}
@article{Hu2003509,
  title = {Finding fuzzy classification rules using data mining techniques },
  journal = {Pattern Recognition Letters },
  volume = {24},
  number = {1–3},
  pages = {509 - 519},
  year = {2003},
  note = {},
  issn = {0167-8655},
  doi = {http://dx.doi.org/10.1016/S0167-8655(02)00273-8},
  url = {http://www.sciencedirect.com/science/article/pii/S0167865502002738},
  author = {Yi-Chung Hu and Ruey-Shun Chen and Gwo-Hshiung Tzeng},
  keywords = {Data mining},
  keywords = {Fuzzy sets},
  keywords = {Classification problems},
  keywords = {Genetic algorithms },
  abstract = {Data mining techniques can be used to discover useful patterns by exploring and analyzing data, so, it is feasible to incorporate data mining techniques into the classification process to discover useful patterns or classification rules from training samples. This paper thus proposes a data mining technique to discover fuzzy classification rules based on the well-known Apriori algorithm. Significantly, since it is difficult for users to specify the minimum fuzzy support used to determine the frequent fuzzy grids or the minimum fuzzy confidence used to determine the effective classification rules derived from frequent fuzzy grids, therefore the genetic algorithms are incorporated into the proposed method to determine those two thresholds with binary chromosomes. For classification generalization ability, the simulation results from the iris data and the appendicitis data demonstrate that the proposed method performs well in comparison with other classification methods. }
}
@article{Maks2012680,
  title = {A lexicon model for deep sentiment analysis and opinion mining applications },
  journal = {Decision Support Systems },
  volume = {53},
  number = {4},
  pages = {680 - 688},
  year = {2012},
  note = {1) Computational Approaches to Subjectivity and Sentiment Analysis 2) Service Science in Information Systems Research : Special Issue on \{PACIS\} 2010 },
  issn = {0167-9236},
  doi = {http://dx.doi.org/10.1016/j.dss.2012.05.025},
  url = {http://www.sciencedirect.com/science/article/pii/S0167923612001364},
  author = {Isa Maks and Piek Vossen},
  keywords = {Sentiment analysis},
  keywords = {Attitude analysis},
  keywords = {Subjectivity lexicon model},
  keywords = {Speaker/writer perspective },
  abstract = {This paper presents a lexicon model for the description of verbs, nouns and adjectives to be used in applications like sentiment analysis and opinion mining. The model aims to describe the detailed subjectivity relations that exist between the actors in a sentence expressing separate attitudes for each actor. Subjectivity relations that exist between the different actors are labeled with information concerning both the identity of the attitude holder and the orientation (positive vs. negative) of the attitude. The model includes a categorization into semantic categories relevant to opinion mining and sentiment analysis and provides means for the identification of the attitude holder and the polarity of the attitude and for the description of the emotions and sentiments of the different actors involved in the text. Special attention is paid to the role of the speaker/writer of the text whose perspective is expressed and whose views on what is happening are conveyed in the text. Finally, validation is provided by an annotation study that shows that these subtle subjectivity relations are reliably identifiable by human annotators. }
}
@article{Cuzzocrea2013281,
  title = {Models and algorithms for high-performance distributed data mining },
  journal = {Journal of Parallel and Distributed Computing },
  volume = {73},
  number = {3},
  pages = {281 - 283},
  year = {2013},
  note = {Models and Algorithms for High-Performance Distributed Data Mining },
  issn = {0743-7315},
  doi = {http://dx.doi.org/10.1016/j.jpdc.2012.11.002},
  url = {http://www.sciencedirect.com/science/article/pii/S0743731512002717},
  author = {Alfredo Cuzzocrea}
}
@article{Gamberger200327,
  title = {Active subgroup mining: a case study in coronary heart disease risk group detection },
  journal = {Artificial Intelligence in Medicine },
  volume = {28},
  number = {1},
  pages = {27 - 57},
  year = {2003},
  note = {},
  issn = {0933-3657},
  doi = {http://dx.doi.org/10.1016/S0933-3657(03)00034-4},
  url = {http://www.sciencedirect.com/science/article/pii/S0933365703000344},
  author = {Dragan Gamberger and Nada Lavrač and Goran Krstačić},
  keywords = {Coronary heart disease},
  keywords = {Active mining},
  keywords = {Machine learning},
  keywords = {Subgroup discovery},
  keywords = {Risk group detection},
  keywords = {Non-invasive cardiovascular tests },
  abstract = {This paper presents an approach to active mining of patient records aimed at discovering patient groups at high risk for coronary heart disease (CHD). The approach proposes active expert involvement in the following steps of the knowledge discovery process: data gathering, cleaning and transformation, subgroup discovery, statistical characterization of induced subgroups, their interpretation, and the evaluation of results. Since the main risk factors are made explicit in the discovery and characterization of risk subgroups, the proposed methodology has high potential for patient screening and early detection of patient groups at risk for CHD. }
}
@article{Lee2002197,
  title = {Fuzzy cognitive map approach to web-mining inference amplification },
  journal = {Expert Systems with Applications },
  volume = {22},
  number = {3},
  pages = {197 - 211},
  year = {2002},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/S0957-4174(01)00054-9},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417401000549},
  author = {Kun Chang Lee and Jin Sung Kim and Nam Ho Chung and Soon Jae Kwon},
  keywords = {Web-mining},
  keywords = {Knowledge-base},
  keywords = {Causal knowledge},
  keywords = {Fuzzy cognitive map },
  abstract = {This paper is concerned with proposing the fuzzy cognitive map (FCM)-driven inference amplification mechanism in the field of web-mining. With the recent advent of the Internet, most modern firms are now geared towards using web technology in their daily as well as strategic activities. The web-mining technology provides them with an unprecedented ability to analyze web-log data, which are seemingly full of useful information but often lack important and meaningful information. This indicates the need to develop an advanced inference mechanism extracting richer implications from the web-mining results. In this sense, we propose a new web-mining inference amplification (WEMIA) mechanism using the inference logic of FCM. Association rule mining is adopted as the web-mining technique to prove the validity of the proposed WEMIA. The main recipe of the proposed \{WEMIA\} is the three-phased inference amplification. The first phase is to apply association rule mining, and the second phase is to transform the association rules into FCM-driven causal knowledge bases. The third phase is dedicated to amplifying the inference by developing the causal knowledge-based inference equivalence property, which was derived from analyzing the inference mechanism of FCMs. With an illustrative web-log database, we present results proving the robustness of the proposed \{WEMIA\} mechanism. }
}
@article{Yang2012S16,
  title = {Target discovery from data mining approaches },
  journal = {Drug Discovery Today },
  volume = {17, Supplement},
  number = {0},
  pages = {S16 - S23},
  year = {2012},
  note = {Strategic Approach to Target Identification and Validation: A Supplement to Drug Discovery Today },
  issn = {1359-6446},
  doi = {http://dx.doi.org/10.1016/j.drudis.2011.12.006},
  url = {http://www.sciencedirect.com/science/article/pii/S1359644611004338},
  author = {Yongliang Yang and S. James Adelstein and Amin I. Kassis},
  abstract = {Data mining of available biomedical data and information has greatly boosted target discovery in the ‘omics’ era. Target discovery is the key step in the biomarker and drug discovery pipeline to diagnose and fight human diseases. In biomedical science, the ‘target’ is a broad concept ranging from molecular entities (such as genes, proteins and miRNAs) to biological phenomena (such as molecular functions, pathways and phenotypes). Within the context of biomedical science, data mining refers to a bioinformatics approach that combines biological concepts with computer tools or statistical methods that are mainly used to discover, select and prioritize targets. In response to the huge demand for data mining in target discovery in the ‘omics’ era, this review explicates various data mining approaches and their applications to target discovery, with emphasis on text and microarray data analysis. Two emerging data mining approaches, chemogenomic data mining and proteomic data mining, are briefly introduced. Also discussed are the limitations of various data mining approaches at the level of database integration, the quality of data annotation, sample heterogeneity and the performance of analytical and mining tools. Tentative strategies for integrating different data sources for target discovery, such as integrating text mining with high-throughput data analysis and integrating mining with pathway databases, are introduced. }
}
@article{Liu20124204,
  title = {Attribute-restricted latent topic model for person re-identification },
  journal = {Pattern Recognition },
  volume = {45},
  number = {12},
  pages = {4204 - 4213},
  year = {2012},
  note = {},
  issn = {0031-3203},
  doi = {http://dx.doi.org/10.1016/j.patcog.2012.05.019},
  url = {http://www.sciencedirect.com/science/article/pii/S0031320312002658},
  author = {Xiao Liu and Mingli Song and Qi Zhao and Dacheng Tao and Chun Chen and Jiajun Bu},
  keywords = {Visual attribute},
  keywords = {Attribute-restricted latent topic model},
  keywords = {Person re-identification},
  keywords = {Semantic topic },
  abstract = {Searching for specific persons from surveillance videos captured by different cameras, known as person re-identification, is a key yet under-addressed challenge. Difficulties arise from the large variations of human appearance in different poses, and from the different camera views that may be involved, making low-level descriptor representation unreliable. In this paper, we propose a novel Attribute-Restricted Latent Topic Model (ARLTM) to encode targets into semantic topics. Compared to conventional topic models such as \{LDA\} and pLSI, \{ARLTM\} performs best by imposing semantic restrictions onto the generation of human-specific attributes. We use \{MCMC\} \{EM\} for model learning. Experimental results show that our method achieves state-of-the-art performance. }
}
@article{Harada20131311,
  title = {The Prediction of Ellipses Using Topic Model for Japanese Colloquial Inquiry Text },
  journal = {Procedia Computer Science },
  volume = {22},
  number = {0},
  pages = {1311 - 1318},
  year = {2013},
  note = {17th International Conference in Knowledge Based and Intelligent Information and Engineering Systems - \{KES2013\} },
  issn = {1877-0509},
  doi = {http://dx.doi.org/10.1016/j.procs.2013.09.219},
  url = {http://www.sciencedirect.com/science/article/pii/S1877050913010120},
  author = {Tomohiko Harada and Yoshikatsu Fujita and Kazuhiko Tsuda},
  keywords = {Colloquial expressions},
  keywords = {Ellipsis},
  keywords = {Statistical topic models},
  keywords = {Gibbs sampling},
  keywords = {\{LDA\} },
  abstract = {Inquiries through Web forms and e-mails are generally increasing. These inquiry texts usually include many informal expressions in the colloquial style and many omitted words. An omitted word causes the meaning of a sentence to become ambiguous and makes the reader misread and misunderstand the context. In this paper we propose a method to predict omitted words from context and knowledge using topic information. From the results of the evaluation experiment, we have confirmed that some of our methods can predict omitted words at an accuracy rate of more than 40% for the expressions used in the experiment. }
}
@article{Chau2003167,
  title = {Design and evaluation of a multi-agent collaborative Web mining system },
  journal = {Decision Support Systems },
  volume = {35},
  number = {1},
  pages = {167 - 183},
  year = {2003},
  note = {Web Retrieval and Mining },
  issn = {0167-9236},
  doi = {http://dx.doi.org/10.1016/S0167-9236(02)00103-3},
  url = {http://www.sciencedirect.com/science/article/pii/S0167923602001033},
  author = {Michael Chau and Daniel Zeng and Hsinchun Chen and Michael Huang and David Hendriawan},
  keywords = {Web searching},
  keywords = {Web content mining},
  keywords = {Collaborative information retrieval},
  keywords = {Collaboration behavior},
  keywords = {Collaborative filtering},
  keywords = {Multi-agent systems},
  keywords = {Software agents},
  keywords = {Post-retrieval analysis },
  abstract = {Most existing Web search tools work only with individual users and do not help a user benefit from previous search experiences of others. In this paper, we present the Collaborative Spider, a multi-agent system designed to provide post-retrieval analysis and enable across-user collaboration in Web search and mining. This system allows the user to annotate search sessions and share them with other users. We also report a user study designed to evaluate the effectiveness of this system. Our experimental findings show that subjects' search performance was degraded, compared to individual search scenarios in which users had no access to previous searches, when they had access to a limited number (e.g., 1 or 2) of earlier search sessions done by other users. However, search performance improved significantly when subjects had access to more search sessions. This indicates that gain from collaboration through collaborative Web searching and analysis does not outweigh the overhead of browsing and comprehending other users' past searches until a certain number of shared sessions have been reached. In this paper, we also catalog and analyze several different types of user collaboration behavior observed in the context of Web mining. }
}
@article{Costa20124813,
  title = {A framework for building web mining applications in the world of blogs: A case study in product sentiment analysis },
  journal = {Expert Systems with Applications },
  volume = {39},
  number = {5},
  pages = {4813 - 4834},
  year = {2012},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2011.09.135},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417411014588},
  author = {Evandro Costa and Rafael Ferreira and Patrick Brito and Ig Ibert Bittencourt and Olavo Holanda and Aydano Machado and Tarsis Marinho},
  keywords = {Semantic web},
  keywords = {Social web},
  keywords = {Blogs},
  keywords = {Sentiment analysis },
  abstract = {Recently there has been much interest in electronic commerce applications that use data mining techniques to explore datasets in the social media context. However, most such applications have been developed in an ad hoc manner, mainly due to the lack of adequate tools, which makes them difficult to customize and highly time-consuming to construct and maintain. This work addresses these problems and proposes a software framework for building Web mining applications in the blog world. The architecture of the proposed framework combines blog crawling and data mining algorithms in order to provide a complete and flexible solution for building general-purpose Web mining applications. The framework's flexibility allows some important customizations, such as the construction of adapters for reading text from different blogs, and the use of different pre-processing techniques and data mining algorithms. In order to improve the efficacy of information extraction from blogs, an ontology is used in the blog's description. For this, software agents are responsible for tracking and indexing blogs related to a specific tag and for mining blog datasets. Moreover, web services are used to encapsulate existing tools and maximize reuse. This framework has been instantiated to help blog users effectively find relevant information in the blog world. The focus of this paper is on describing the novel software architecture of the general framework (blog crawling and data mining), providing detailed information about the data mining sub-framework, which uses semantic web services technology for automating service composition and constitutes the main research contribution. A case study of an e-commerce application for analyzing users' sentiment regarding specific products is reported; its results consider the effort reduction achieved when creating a web mining application using the proposed integrated frameworks and existing data mining tools, as well as a qualitative analysis of quality aspects of the developed application, such as the evolution impact. }
}
@article{Wangsuk2013175,
  title = {Trajectory Mining for Keystroke Dynamics Authentication },
  journal = {Procedia Computer Science },
  volume = {24},
  number = {0},
  pages = {175 - 183},
  year = {2013},
  note = {17th Asia Pacific Symposium on Intelligent and Evolutionary Systems, \{IES2013\} },
  issn = {1877-0509},
  doi = {http://dx.doi.org/10.1016/j.procs.2013.10.041},
  url = {http://www.sciencedirect.com/science/article/pii/S1877050913011836},
  author = {Kasem Wangsuk and Tanapat Anusas-amornkul},
  keywords = {Authentication},
  keywords = {Keystroke Dynamics},
  keywords = {Trajectory Dissimilarity},
  keywords = {Feature Selections },
  abstract = {This paper focuses on enhancing a username and password authentication scheme, which has some weaknesses because a username is publicly known and a password can be guessed. When an attacker knows or guesses a password correctly, the system is compromised. Therefore, this research focuses on this weakness and proposes an additional security token for this scheme by combining keystroke dynamics into the system. A username is typically not changed, but a password is required to change frequently for a better security level. A username is typed frequently, such that the familiar typing can be used as a behavioral biometric of a user. Therefore, a keystroke dynamics profile is proposed using a trajectory dissimilarity technique to verify a user's typing behavior on a username as an additional authentication token. Several features are mined for keystroke dynamics to create a trajectory profile, which gives best results of a 4% equal error rate (EER), or 96% authentication accuracy. }
}
@article{Yanli2012638,
  title = {Research on Data Preprocessing In Credit Card Consuming Behavior Mining },
  journal = {Energy Procedia },
  volume = {17, Part A},
  number = {0},
  pages = {638 - 643},
  year = {2012},
  note = {2012 International Conference on Future Electrical Power and Energy System },
  issn = {1876-6102},
  doi = {http://dx.doi.org/10.1016/j.egypro.2012.02.147},
  url = {http://www.sciencedirect.com/science/article/pii/S1876610212004833},
  author = {Zhu Yan-li and Zhang Jia},
  keywords = {credit card},
  keywords = {consuming behavior},
  keywords = {data cleaning},
  keywords = {integration},
  keywords = {reduction },
  abstract = {The data source resulting from preprocessing directly affects the quality of data mining. Preprocessing methods differ according to the particular application fields and industries. This paper describes in detail the data preprocessing of credit card data for customer segmentation, association analysis and risk detection. Firstly, some tables concerned with the analysis topic are selected from the credit card database. Then the paper handles problems in the selected initial data, such as noisy data and missing values, through data preprocessing, which mainly includes data cleaning, integration, transformation and reduction, and obtains the training sample data needed. }
}
@article{Lee20126799,
  title = {Mining knowledge demands from information flow },
  journal = {Expert Systems with Applications },
  volume = {39},
  number = {8},
  pages = {6799 - 6806},
  year = {2012},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2011.12.045},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417411017167},
  author = {Hyun Joon Lee and Jong Hwa Kim},
  keywords = {Knowledge management},
  keywords = {Information flow},
  keywords = {Recommender system},
  keywords = {Collaborative filtering },
  abstract = {Within a collaborative team, members usually come from diverse disciplines, and their demands for knowledge also differ from each other. Information flow is a type of collaborative process which exists behind every collaborative team. This paper is concerned with how to obtain team members' knowledge demands from the information flow. Firstly, the knowledge demands model is defined. Based on the model of knowledge demands and information filtering technologies, some approaches for mining demands from information flow are proposed. This study on knowledge demand mining can pave the way for developing knowledge recommender systems, which can recommend proper knowledge to the proper team members within a collaborative team. }
}
@article{Menczer2003195,
  title = {Complementing search engines with online web mining agents },
  journal = {Decision Support Systems },
  volume = {35},
  number = {2},
  pages = {195 - 212},
  year = {2003},
  note = {Web Data Mining },
  issn = {0167-9236},
  doi = {http://dx.doi.org/10.1016/S0167-9236(02)00106-9},
  url = {http://www.sciencedirect.com/science/article/pii/S0167923602001069},
  author = {Filippo Menczer},
  keywords = {Web mining},
  keywords = {Search engines},
  keywords = {Web intelligence},
  keywords = {InfoSpiders},
  keywords = {MySpiders},
  keywords = {Evaluation metrics},
  keywords = {Estimated recency},
  keywords = {Precision},
  keywords = {Recall },
  abstract = {While search engines have become the major decision support tools for the Internet, there is a growing disparity between the image of the World Wide Web stored in search engine repositories and the actual dynamic, distributed nature of Web data. We propose to attack this problem using an adaptive population of intelligent agents mining the Web online at query time. We discuss the benefits and shortcomings of using dynamic search strategies versus the traditional static methods in which search and retrieval are disjoint. This paper presents a public Web intelligence tool called MySpiders, a threaded multiagent system designed for information discovery. The performance of the system is evaluated by comparing its effectiveness in locating recent, relevant documents with that of search engines. We present results suggesting that augmenting search engines with adaptive populations of intelligent search agents can lead to a significant competitive advantage. We also discuss some of the challenges of evaluating such a system on current Web data, introduce three novel metrics for this purpose, and outline some of the lessons learned in the process. }
}
@article{He2002491,
  title = {Mining a Web Citation Database for author co-citation analysis },
  journal = {Information Processing & Management },
  volume = {38},
  number = {4},
  pages = {491 - 508},
  year = {2002},
  note = {},
  issn = {0306-4573},
  doi = {http://dx.doi.org/10.1016/S0306-4573(01)00046-2},
  url = {http://www.sciencedirect.com/science/article/pii/S0306457301000462},
  author = {Yulan He and Siu Cheung Hui},
  keywords = {Author co-citation analysis},
  keywords = {Data mining},
  keywords = {Web Citation Database},
  keywords = {Intelligent information retrieval },
  abstract = {Author co-citation analysis (ACA) has been widely used in bibliometrics as an analytical method for analyzing the intellectual structure of science studies. It can be used to identify authors from the same or similar research fields. However, such an analysis method relies heavily on statistical tools to perform the analysis and requires human interpretation. The Web Citation Database is a data warehouse used for storing citation indices of Web publications. In this paper, we propose a mining process to automate \{ACA\} based on the Web Citation Database. The mining process uses agglomerative hierarchical clustering (AHC) as the mining technique for author clustering and multidimensional scaling (MDS) for displaying author cluster maps. The clustering results and author cluster map have been incorporated into a citation-based retrieval system known as PubSearch to support author retrieval of Web publications. }
}
@article{Ambrožič2003627,
  title = {Prediction of subsidence due to underground mining by artificial neural networks },
  journal = {Computers & Geosciences },
  volume = {29},
  number = {5},
  pages = {627 - 637},
  year = {2003},
  note = {},
  issn = {0098-3004},
  doi = {http://dx.doi.org/10.1016/S0098-3004(03)00044-X},
  url = {http://www.sciencedirect.com/science/article/pii/S009830040300044X},
  author = {Tomaž Ambrožič and Goran Turk},
  keywords = {Subsidence prediction},
  keywords = {Artificial neural network},
  keywords = {Multi-layer feed-forward neural network},
  keywords = {Approximation of functions},
  keywords = {Mining damage },
  abstract = {As an alternative to empirical prediction methods, methods based on influence functions, and methods based on mechanical models, artificial neural networks (ANNs) can be used for surface subsidence prediction. In our case, a multi-layer feed-forward neural network was used. The training and testing of the neural network are based on the available data. Input variables represent extraction parameters and coordinates of the points of interest, while the output variable represents surface subsidence data. After the neural network has been successfully trained, its performance is tested on a separate testing set. Finally, the surface subsidence trough above the projected excavation is predicted by the trained neural network. The applicability of \{ANN\} for the prediction of surface subsidence was verified on different subsidence models and proved on actual excavated levels and on levelled data of surface profile points in the Velenje Coal Mine. }
}
@article{Wang2012105,
  title = {Nearest-neighbor method using multiple neighborhood similarities for social media data mining },
  journal = {Neurocomputing },
  volume = {95},
  number = {0},
  pages = {105 - 116},
  year = {2012},
  note = {Learning from Social Media Network },
  issn = {0925-2312},
  doi = {http://dx.doi.org/10.1016/j.neucom.2011.06.039},
  url = {http://www.sciencedirect.com/science/article/pii/S0925231212001671},
  author = {Shuhui Wang and Qingming Huang and Shuqiang Jiang and Qi Tian and Lei Qin},
  keywords = {Nearest neighbor method},
  keywords = {Multiple neighborhood similarity},
  keywords = {Visual categorization},
  keywords = {Locality sensitive hashing },
  abstract = {Currently, Nearest-Neighbor approaches (NN) have been applied to large scale real world image data mining. However, the following three disadvantages prevent them from wider application compared to other machine learning methods: (i) the performance is inferior on small datasets; (ii) the performance will degrade for data with high dimensions; (iii) they are heavily dependent on the chosen feature and distance measure. In this paper, we try to overcome the three mentioned intrinsic weaknesses by taking the abundant and diversified content of social media images into account. Firstly, we propose a novel neighborhood similarity measure which encodes both the local density information and semantic information, thus it has better generalization power than the original image-to-image similarity. Secondly, to enhance the scalability, we adopt kernelized Locality Sensitive Hashing (KLSH) to conduct approximated nearest neighbor search by utilizing a set of kernels calculated on several complementary image features. Finally, to enhance the robustness on diversified genres of images, we propose to fuse the discrimination power of different features by combining multiple neighborhood similarities calculated on different features/kernels with the entire retrieved nearest labeled and unlabeled image via the hashing systems. Experimental results on visual categorization on the Caltech-256 and two social media databases show the advantage of our method over traditional \{NN\} methods using the labeled data only. }
}
@article{Zhang2002303,
  title = {A novel Web usage mining approach for search engines },
  journal = {Computer Networks },
  volume = {39},
  number = {3},
  pages = {303 - 310},
  year = {2002},
  note = {},
  issn = {1389-1286},
  doi = {http://dx.doi.org/10.1016/S1389-1286(02)00211-6},
  url = {http://www.sciencedirect.com/science/article/pii/S1389128602002116},
  author = {Dell Zhang and Yisheng Dong},
  keywords = {Web information retrieval},
  keywords = {Multimedia retrieval},
  keywords = {Data mining },
  abstract = {Web usage mining can be very useful to search engines. This paper proposes a novel effective approach to exploit the relationships among users, queries and resources based on the search engine's log. How this method can be applied is illustrated by a Chinese image search engine. }
}
@article{Bouchachia20131,
  title = {Editorial of the special issue: Online fuzzy machine learning and data mining },
  journal = {Information Sciences },
  volume = {220},
  number = {0},
  pages = {1 - 4},
  year = {2013},
  note = {Online Fuzzy Machine Learning and Data Mining },
  issn = {0020-0255},
  doi = {http://dx.doi.org/10.1016/j.ins.2012.10.005},
  url = {http://www.sciencedirect.com/science/article/pii/S0020025512006597},
  author = {Abdelhamid Bouchachia and Edwin Lughofer and Daniel Sanchez}
}
@article{Christidis20129297,
  title = {Using latent topics to enhance search and recommendation in Enterprise Social Software },
  journal = {Expert Systems with Applications },
  volume = {39},
  number = {10},
  pages = {9297 - 9307},
  year = {2012},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2012.02.073},
  url = {http://www.sciencedirect.com/science/article/pii/S095741741200317X},
  author = {Konstantinos Christidis and Gregoris Mentzas and Dimitris Apostolou},
  keywords = {Enterprise Social Software},
  keywords = {Search},
  keywords = {Recommender systems},
  keywords = {Latent topic models},
  keywords = {Latent Dirichlet Allocation },
  abstract = {Enterprise Social Software refers to open and flexible organizational systems and tools which utilize Web 2.0 technologies to stimulate participation through informal interactions. A challenge in Enterprise Social Software is to discover and maintain over time the knowledge structure of topics found relevant to the organization. Knowledge structures, ranging in formality from ontologies to folksonomies, support user activity by enabling users to categorize and retrieve information resources. In this paper we enhance the search and recommendation functionalities of Enterprise Social Software by extending their knowledge structures with the addition of underlying hidden topics which we discover using probabilistic topic models. We employ Latent Dirichlet Allocation in order to elicit hidden topics and use the latter to assess similarities in resource and tag recommendation as well as for the expansion of query results. As an application of our approach we have extended the search and recommendation facilities of an open source Enterprise Social Software system which we have deployed and evaluated in five knowledge-intensive small and medium enterprises. }
}
@article{MartínezTorres201211623,
  title = {An evolutionary factor analysis computation for mining website structures },
  journal = {Expert Systems with Applications },
  volume = {39},
  number = {14},
  pages = {11623 - 11633},
  year = {2012},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2012.04.011},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417412006082},
  author = {M.R. Martínez-Torres and S.L. Toral and B. Palacios and F. Barrero},
  keywords = {Link analysis},
  keywords = {Website structure},
  keywords = {Factor analysis},
  keywords = {Evolutionary computation},
  keywords = {Genetic algorithms },
  abstract = {This paper explores website link structure considering websites as interconnected graphs and analyzing their features as a social network. Two networks have been extracted for representing websites: a domain network containing subdomains or external domains linked through the website and a page network containing webpages browsed from the root domain. Factor analysis provides the statistical methodology to adequately extract the main website profiles in terms of their internal structure. However, due to the large number of indicators, the task of selecting a representative subset of indicators becomes unaffordable. A genetic search of an optimum subset of indicators is proposed in this paper, selecting a multi-objective fitness function based on factor analysis results. The optimum solution provides a coherent and relevant categorization of website profiles, and highlights the possibilities of genetic algorithms as a tool for discovering new knowledge in the field of web mining. }
}
@article{Wang201486,
  title = {Product aspect extraction supervised with online domain knowledge },
  journal = {Knowledge-Based Systems },
  volume = {71},
  number = {0},
  pages = {86 - 100},
  year = {2014},
  note = {},
  issn = {0950-7051},
  doi = {http://dx.doi.org/10.1016/j.knosys.2014.05.018},
  url = {http://www.sciencedirect.com/science/article/pii/S0950705114002081},
  author = {Tao Wang and Yi Cai and Ho-fung Leung and Raymond Y.K. Lau and Qing Li and Huaqing Min},
  keywords = {Aspect extraction},
  keywords = {Product aspect},
  keywords = {Topic model},
  keywords = {Opinion mining},
  keywords = {Review summarization },
  abstract = {One of the most challenging problems in aspect-based opinion mining is aspect extraction, which aims to identify expressions that describe aspects of products (called aspect expressions) and categorize domain-specific synonymous expressions. Although a number of methods of aspect extraction have been proposed before, very few of them are designed to improve the interpretability of generated aspects. Existing methods either generate multiple fine-grained aspects without proper categorization or categorize semantically unrelated product aspects (e.g., by unsupervised topic modeling). In this paper, we first examine previous studies on product aspect extraction. To overcome the limitations of existing methods, two novel semi-supervised models for product aspect extraction are then proposed. More specifically, the proposed methodology first extracts seeding aspects and related terms from detailed product descriptions readily available on E-commerce websites. Next, product reviews are regrouped according to these seeding aspects so that more effective textual contexts for topic modeling are built. Finally, two novel semi-supervised topic models are developed to extract human-comprehensible product aspects. For the first proposed topic model, the Fine-grained Labeled \{LDA\} (FL-LDA), seeding aspects are applied to guide the model to discover words that are related to these seeding aspects. For the second model, the Unified Fine-grained Labeled \{LDA\} (UFL-LDA), we incorporate unlabeled documents to extend the FL-LDA model so that words related to the seeding aspects or other high-frequency words in customer reviews are extracted. Our experimental results demonstrate that the proposed methods outperform state-of-the-art methods. }
}
@article{Newton2012321,
  title = {International Information Conference on Search, Data Mining and Visualization, Nice, France, April 2012 },
  journal = {World Patent Information },
  volume = {34},
  number = {4},
  pages = {321 - 322},
  year = {2012},
  note = {},
  issn = {0172-2190},
  doi = {http://dx.doi.org/10.1016/j.wpi.2012.06.002},
  url = {http://www.sciencedirect.com/science/article/pii/S0172219012000828},
  author = {David Newton}
}
@article{Friedman2003189,
  title = {A vocabulary development and visualization tool based on natural language processing and the mining of textual patient reports },
  journal = {Journal of Biomedical Informatics },
  volume = {36},
  number = {3},
  pages = {189 - 201},
  year = {2003},
  note = {},
  issn = {1532-0464},
  doi = {http://dx.doi.org/10.1016/j.jbi.2003.08.005},
  url = {http://www.sciencedirect.com/science/article/pii/S1532046403000704},
  author = {Carol Friedman and Hongfang Liu and Lyudmila Shagina},
  keywords = {Natural language processing},
  keywords = {Controlled vocabulary},
  keywords = {XML-based graphical user interface},
  keywords = {Text mining},
  keywords = {Medical terminology },
  abstract = {Medical terminologies are critical for automated healthcare systems. Some terminologies, such as the \{UMLS\} and \{SNOMED\} are comprehensive, whereas others specialize in limited domains (i.e., BIRADS) or are developed for specific applications. An important feature of a terminology is comprehensive coverage of relevant clinical terms and ease of use by users, which include computerized applications. We have developed a method for facilitating vocabulary development and maintenance that is based on utilization of natural language processing to mine large collections of clinical reports in order to obtain information on terminology as expressed by physicians. Once the reports are processed and the terms structured and collected into an \{XML\} representational schema, it is possible to determine information about terms, such as frequency of occurrence, compositionality, relations to other terms (such as modifiers), and correspondence to a controlled vocabulary. This paper describes the method and discusses how it can be used as a tool to help vocabulary builders navigate through the terms physicians use, visualize their relations to other terms via a flexible viewer, and determine their correspondence to a controlled vocabulary. }
}
@article{Bose2001211,
  title = {Business data mining — a machine learning perspective },
  journal = {Information & Management },
  volume = {39},
  number = {3},
  pages = {211 - 225},
  year = {2001},
  note = {},
  issn = {0378-7206},
  doi = {http://dx.doi.org/10.1016/S0378-7206(01)00091-X},
  url = {http://www.sciencedirect.com/science/article/pii/S037872060100091X},
  author = {Indranil Bose and Radha K. Mahapatra},
  keywords = {Business applications},
  keywords = {Data mining},
  keywords = {Machine learning },
  abstract = {The objective of this paper is to inform the information systems (IS) manager and business analyst about the role of machine learning techniques in business data mining. Data mining is a fast growing application area in business. Machine learning techniques are used for data analysis and pattern discovery and thus can play a key role in the development of data mining applications. Understanding the strengths and weaknesses of these techniques in the context of business is useful in selecting an appropriate method for a specific application. The paper, therefore, provides an overview of machine learning techniques and discusses their strengths and weaknesses in the context of mining business data. A survey of data mining applications in business is provided to investigate the use of learning techniques. Rule induction (RI) was found to be most popular, followed by neural networks (NNs) and case-based reasoning (CBR). Most applications were found in financial areas, where prediction of the future was a dominant task category. }
}
@article{Afendi20131,
  title = {Data Mining Methods for Omics and Knowledge of Crude Medicinal Plants toward Big Data Biology },
  journal = {Computational and Structural Biotechnology Journal },
  volume = {4},
  number = {5},
  pages = {1 - 14},
  year = {2013},
  note = {},
  issn = {2001-0370},
  doi = {http://dx.doi.org/10.5936/csbj.201301010},
  url = {http://www.sciencedirect.com/science/article/pii/S2001037014600532},
  author = {Farit M. Afendi and Naoaki Ono and Yukiko Nakamura and Kensuke Nakamura and Latifah K. Darusman and Nelson Kibinge and Aki Hirai Morita and Ken Tanaka and Hisayuki Horai and Md. Altaf-Ul-Amin and Shigehiko Kanaya},
  abstract = {Molecular biological data has rapidly increased with the recent progress of the Omics fields, e.g., genomics, transcriptomics, proteomics and metabolomics, which necessitates the development of databases and methods for efficient storage, retrieval, integration and analysis of massive data. The present study reviews the usage of the \{KNApSAcK\} Family \{DB\} in metabolomics and related areas, discusses several statistical methods for handling multivariate data and shows their application to Indonesian blended herbal medicines (Jamu) as a case study. Exploration using Biplot reveals that many plants are rarely utilized while some plants are highly utilized toward specific efficacy. Furthermore, the ingredients of Jamu formulas are modeled using Partial Least Squares Discriminant Analysis (PLS-DA) in order to predict their efficacy. The plants used in each Jamu medicine served as the predictors, whereas the efficacy of each Jamu provided the responses. This model produces 71.6% correct classification in predicting efficacy. A permutation test is then used to determine the plants that serve as main ingredients in a Jamu formula by evaluating the significance of the PLS-DA coefficients. Next, in order to explain the role of plants that serve as main ingredients in Jamu medicines, information on the pharmacological activity of the plants is added to the predictor block. Then an N-PLS-DA model, the multiway version of PLS-DA, is utilized to handle the three-dimensional array of the predictor block. The resulting N-PLS-DA model reveals that the effects of some pharmacological activities are specific to certain efficacies while the other activities are diverse toward many efficacies. The mathematical modeling introduced in the present study can be utilized in global analysis of big data aiming to reveal the underlying biology. }
}
@article{Li2001253,
  title = {Feature space theory — a mathematical foundation for data mining },
  journal = {Knowledge-Based Systems },
  volume = {14},
  number = {5–6},
  pages = {253 - 257},
  year = {2001},
  note = {},
  issn = {0950-7051},
  doi = {http://dx.doi.org/10.1016/S0950-7051(01)00103-4},
  url = {http://www.sciencedirect.com/science/article/pii/S0950705101001034},
  author = {Hong Xing Li and Li D. Xu},
  keywords = {Knowledge representation},
  keywords = {Data mining},
  keywords = {Features},
  keywords = {Feature space},
  keywords = {Feature construction},
  keywords = {Feature selection},
  keywords = {Feature reduction},
  keywords = {Relevance analysis },
  abstract = {In data mining, important tasks in classification and prediction include feature construction, feature description, feature selection, feature relevance analysis and feature reduction. In this paper, feature space theory is introduced as a mathematical foundation for feature-related concepts and techniques in data mining. }
}
@incollection{Savona2013223,
  title = {14 - Measuring Systemic Risk from Country Fundamentals: A Data Mining Approach },
  editor = {Wehn, Carsten S. and Hoppe, Christian and Gregoriou, Greg N. },
  booktitle = {Rethinking Valuation and Pricing Models },
  publisher = {Academic Press},
  edition = {},
  address = {},
  year = {2013},
  pages = {223 - 240},
  isbn = {978-0-12-415875-7},
  doi = {http://dx.doi.org/10.1016/B978-0-12-415875-7.00014-2},
  url = {http://www.sciencedirect.com/science/article/pii/B9780124158757000142},
  author = {Roberto Savona and Marika Vezzoli},
  abstract = {Using data from Organization for Economic Cooperation and Development economies over the period 1981–2010 together with the Reinhart–Rogoff financial crises database, in this chapter we shed some light on the inner mechanism of systemic risk by combining bank-, currency-, inflation- and sovereign-type crises. Through a novel recursive partitioning approach introduced in Vezzoli and Stone (2007) combined with principal component analysis, we provide a set of early warnings to accurately predict financial crises, also inferring the underlying common latent risk factor that characterizes our proxy for systemic risk. }
}
@article{Tibbett20121,
  title = {Recent advances in restoration ecology: Examining the modern Australian agro-ecological and post-mining landscapes },
  journal = {Agriculture, Ecosystems & Environment },
  volume = {163},
  number = {0},
  pages = {1 - 2},
  year = {2012},
  note = {Recent advances in restoration ecology: Examining the modern Australian agro-ecological and post-mining landscapes },
  issn = {0167-8809},
  doi = {http://dx.doi.org/10.1016/j.agee.2012.07.007},
  url = {http://www.sciencedirect.com/science/article/pii/S0167880912002678},
  author = {Mark Tibbett and David Mulligan and Patrick Audet}
}
@incollection{Jin2013409,
  title = {Chapter 18 - Three-Dimensional Mode Discrete Element Method: Elastic Model },
  editor = {Zhang, Chuhan and Jin, Feng and Wang, Jinting and Xu, Yanjie },
  booktitle = {Seismic Safety Evaluation of Concrete Dams },
  publisher = {Butterworth-Heinemann},
  edition = {},
  address = {},
  year = {2013},
  pages = {409 - 428},
  isbn = {978-0-12-408083-6},
  doi = {http://dx.doi.org/10.1016/B978-0-12-408083-6.00018-0},
  url = {http://www.sciencedirect.com/science/article/pii/B9780124080836000180},
  author = {Feng Jin and Chong Zhang and Wei Hu and Jinting Wang},
  keywords = {contact},
  keywords = {discontinuum},
  keywords = {mode discrete element method},
  keywords = {numerical method },
  abstract = {A three-dimensional mode-deformable discrete element method (3MDEM) is presented for block systems that satisfy small strain, finite displacement and finite rotation conditions. This efficient numerical method simulates the mechanical behavior of nonlinear, large deformation and dynamic problems. Under the assumption of small strain, the motion of points in a deformable block can be decomposed into the sum of the block’s rigid body motion and the deformation of the block. Kinematic equations for deformable blocks are derived. Deformation of blocks can be expressed by a series of deformation modes, which can be decoupled under given orthogonal conditions. The simulation results verify that good accuracy is achievable under the small strain condition using this method, compared with the finite element method. Under the condition of finite deformation, similar results are obtained by the proposed method and a three-dimensional deformable distinct element code, 3DEC. In verification examples, 3MDEM is more efficient than 3DEC. }
}
@article{Hong20021,
  title = {Knowledge-based data mining of news information on the Internet using cognitive maps and neural networks },
  journal = {Expert Systems with Applications },
  volume = {23},
  number = {1},
  pages = {1 - 8},
  year = {2002},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/S0957-4174(02)00022-2},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417402000222},
  author = {Taeho Hong and Ingoo Han},
  keywords = {Data mining},
  keywords = {Internet},
  keywords = {Cognitive maps},
  keywords = {Neural networks },
  abstract = {In this paper, we investigate ways to apply news information on the Internet to the prediction of interest rates. We developed the Knowledge-Based News Miner (KBNMiner), which is designed to represent the knowledge of interest rate experts with cognitive maps (CMs), to search and retrieve news information on the Internet according to prior knowledge, and to apply the information retrieved from the news to a neural network model for the prediction of interest rates. This paper focuses on improving the performance of data mining by using prior knowledge. Real-world interest rate prediction data are used to illustrate the performance of the KBNMiner. Our integrated approach, which utilizes \{CMs\} and neural networks, has been shown to be effective in experiments. The 10-fold cross-validation is used to test our research model, and the experimental results of the paired t-test are found to be statistically significant. }
}
@article{Huser20121018,
  title = {Process Mining: Discovery, Conformance and Enhancement of Business Processes },
  journal = {Journal of Biomedical Informatics },
  volume = {45},
  number = {5},
  pages = {1018 - 1019},
  year = {2012},
  note = {Text Mining and Natural Language Processing in Pharmacogenomics },
  issn = {1532-0464},
  doi = {http://dx.doi.org/10.1016/j.jbi.2012.06.007},
  url = {http://www.sciencedirect.com/science/article/pii/S1532046412000974},
  author = {Vojtech Huser}
}
@article{Hu2002735,
  title = {Mining fuzzy association rules for classification problems },
  journal = {Computers & Industrial Engineering },
  volume = {43},
  number = {4},
  pages = {735 - 750},
  year = {2002},
  note = {},
  issn = {0360-8352},
  doi = {http://dx.doi.org/10.1016/S0360-8352(02)00136-5},
  url = {http://www.sciencedirect.com/science/article/pii/S0360835202001365},
  author = {Yi-Chung Hu and Ruey-Shun Chen and Gwo-Hshiung Tzeng},
  keywords = {Data mining},
  keywords = {Knowledge acquisition},
  keywords = {Classification problems},
  keywords = {Association rules },
  abstract = {The effective development of data mining techniques for the discovery of knowledge from training samples for classification problems in industrial engineering is necessary in applications, such as group technology. This paper proposes a learning algorithm, which can be viewed as a knowledge acquisition tool, to effectively discover fuzzy association rules for classification problems. The consequence part of each rule is one class label. The proposed learning algorithm consists of two phases: one to generate large fuzzy grids from training samples by fuzzy partitioning in each attribute, and the other to generate fuzzy association rules for classification problems by large fuzzy grids. The proposed learning algorithm is implemented by scanning training samples stored in a database only once and applying a sequence of Boolean operations to generate fuzzy grids and fuzzy rules; therefore, it can be easily extended to discover other types of fuzzy association rules. The simulation results from the iris data demonstrate that the proposed learning algorithm can effectively derive fuzzy association rules for classification problems. }
}
@article{Fan20128844,
  title = {Using hybrid data mining and machine learning clustering analysis to predict the turnover rate for technology professionals },
  journal = {Expert Systems with Applications },
  volume = {39},
  number = {10},
  pages = {8844 - 8851},
  year = {2012},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2012.02.005},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417412002473},
  author = {Chin-Yuan Fan and Pei-Shu Fan and Te-Yi Chan and Shu-Hao Chang},
  keywords = {Turnover trend},
  keywords = {Clustering analysis},
  keywords = {Self-organizing map},
  keywords = {Neural network clustering },
  abstract = {This study applies clustering analysis for data mining and machine learning to predict trends in technology professional turnover rates, including the hybrid artificial neural network and clustering analysis known as the self-organizing map (SOM). This hybrid clustering method was used to study the individual characteristics of turnover trend clusters. Using a transaction questionnaire, we studied the period of peak turnover, which occurs after the Chinese New Year, for individuals divided into various age groups. The turnover trend of technology professionals was examined in well-known Taiwanese companies. The results indicate that the high outstanding turnover trend circle was primarily caused by a lack of inner fidelity identification, leadership and management. Based on cross-verification, the clustering accuracy rate was 92.7%. This study addressed problems related to the rapid loss of key human resources and should help organizations learn how to enhance competitiveness and efficiency. }
}
@article{Lin20121486,
  title = {Compliments in Taiwan and Mainland Chinese: The influence of region and compliment topic },
  journal = {Journal of Pragmatics },
  volume = {44},
  number = {11},
  pages = {1486 - 1502},
  year = {2012},
  note = {},
  issn = {0378-2166},
  doi = {http://dx.doi.org/10.1016/j.pragma.2012.06.012},
  url = {http://www.sciencedirect.com/science/article/pii/S037821661200152X},
  author = {Chih Ying Lin and Helen Woodfield and Wei Ren},
  keywords = {Chinese compliments},
  keywords = {Regional effect},
  keywords = {Compliment topic },
  abstract = {Situated in a recently established field of variational pragmatics (Schneider and Barron, 2008), this study investigates one of the under-researched non-Indo-European languages, Chinese, with regard to the influence of macro-social and micro-social factors on compliments. More specifically, the present study focuses on the impact of region, a macro-social variable, and compliment topic, a micro-social factor, on Chinese compliments given by Taiwan Chinese and Mainland Chinese higher education students. Sixty Taiwanese and sixty Mainland Chinese, equally gendered in each group, completed a written discourse completion task consisting of eight content-enriched situations (Billmyer and Varghese, 2000) eliciting compliments. With regard to the impact of region, commonalities emerged between those compliments of Chinese students in Taiwan and Mainland China. Both groups preferred to offer Explicit compliments as well as Implicit compliments in the form of Requests, Assumptions, and Want Statements. Overall, Explicit compliments emerged as the most popular strategy. However, statistically significant differences were identified between the two groups in a few Implicit compliment strategies. Regarding the effect of compliment topic, both Taiwan and Mainland Chinese students utilized several compliment strategies in similar ways across appearance/possession and performance/ability situations. It appears that in most cases, it was compliment topic rather than the variety of Chinese which modulated the compliments by both groups. In addition, the paper suggests that compliments in Taiwan and Mainland Chinese may have undergone a change, possibly influenced by western cultures. }
}
@article{Loo200241,
  title = {A lattice-based approach for I/O efficient association rule mining },
  journal = {Information Systems },
  volume = {27},
  number = {1},
  pages = {41 - 74},
  year = {2002},
  note = {},
  issn = {0306-4379},
  doi = {http://dx.doi.org/10.1016/S0306-4379(01)00046-1},
  url = {http://www.sciencedirect.com/science/article/pii/S0306437901000461},
  author = {K.K Loo and Chi Lap Yip and Ben Kao and David Cheung},
  keywords = {Data mining},
  keywords = {Association rules},
  keywords = {Lattice},
  keywords = {Apriori},
  keywords = {\{LGen\}},
  keywords = {FindLarge },
  abstract = {Most algorithms for association rule mining are variants of the basic Apriori algorithm (Agrawal and Srikant, Fast algorithms for mining association rules in databases, in: Proceedings of the 20th International Conference on Very Large Data Bases (VLDB’94), Santiago, Chile, 1994, pp. 487–499). One characteristic of these Apriori-based algorithms is that candidate itemsets are generated in rounds, with the size of the itemsets incremented by one per round. The number of database scans required by Apriori-based algorithms thus depends on the size of the biggest frequent itemsets. In this paper, we devise a more general candidate set generation algorithm, LGen, which generates candidate itemsets of multiple sizes during each database scan. We present an algorithm FindLarge which uses \{LGen\} to find frequent itemsets. We show that, given a reasonable set of suggested frequent itemsets, FindLarge can significantly reduce the number of I/O passes required. In the best cases, only two passes are sufficient to discover all the frequent itemsets irrespective of the size of the biggest ones. Two I/O-saving algorithms, namely \{DIC\} and Pincer-Search, are compared with FindLarge in a series of experiments. We discuss the conditions under which FindLarge significantly outperforms the others in terms of I/O efficiency. }
}
@article{NietoSánchez2002583,
  title = {A feature mining based approach for the classification of text documents into disjoint classes },
  journal = {Information Processing & Management },
  volume = {38},
  number = {4},
  pages = {583 - 604},
  year = {2002},
  note = {},
  issn = {0306-4573},
  doi = {http://dx.doi.org/10.1016/S0306-4573(01)00049-8},
  url = {http://www.sciencedirect.com/science/article/pii/S0306457301000498},
  author = {Salvador Nieto Sánchez and Evangelos Triantaphyllou and Donald Kraft},
  keywords = {Document classification},
  keywords = {Document indexing},
  keywords = {Vector space model},
  keywords = {Data mining},
  keywords = {One Clause At a Time (OCAT) algorithm},
  keywords = {Machine learning },
  abstract = {This paper proposes a new approach for classifying text documents into two disjoint classes. The new approach is based on extracting patterns, in the form of two logical expressions, which are defined on various features (indexing terms) of the documents. The pattern extraction is aimed at providing descriptions (in the form of two logical expressions) of the two classes of positive and negative examples. This is achieved by means of a data mining approach, called One Clause At a Time (OCAT), which is based on mathematical logic. The application of a logic-based approach to text document classification is critical when one wishes to be able to justify why a particular document has been assigned to one class versus the other class. This situation occurs, for instance, in declassifying documents that have been previously considered important to national security and thus are currently being kept secret. Some computational experiments investigated the effectiveness of the OCAT-based approach and compared it to the well-known vector space model (VSM). These tests also investigated finding the best indexing terms that could be used in making these classification decisions. The results of these computational experiments on a sample of 2897 text documents from the \{TIPSTER\} collection indicate that the OCAT-based approach has many advantages over the \{VSM\} approach for solving this type of text document classification problem. Moreover, a guided strategy for the OCAT-based approach is presented for deciding which document one needs to consider next while building the training example sets. }
}
@article{Xiao2001191,
  title = {Efficient mining of traversal patterns },
  journal = {Data & Knowledge Engineering },
  volume = {39},
  number = {2},
  pages = {191 - 214},
  year = {2001},
  note = {Building Web warehouse for semi-structured data },
  issn = {0169-023X},
  doi = {http://dx.doi.org/10.1016/S0169-023X(01)00039-8},
  url = {http://www.sciencedirect.com/science/article/pii/S0169023X01000398},
  author = {Yongqiao Xiao and Margaret H. Dunham},
  keywords = {Data mining},
  keywords = {Clickstream analysis},
  keywords = {Traversal patterns},
  keywords = {Suffix tree },
  abstract = {A new problem of mining traversal patterns from Web access logs is introduced. The traversal patterns are defined to keep duplicates as well as consecutive ordering in the sessions. Then an efficient algorithm is proposed. The algorithm is online, which allows the user to see the incremental results with respect to the scanned part of the database. The algorithm also adapts to large databases through dynamic compressions and effective pruning. Finally the algorithm is evaluated through experiments with real Web logs. }
}
@article{Galitsky201221,
  title = {Inferring the semantic properties of sentences by mining syntactic parse trees },
  journal = {Data & Knowledge Engineering },
  volume = {81–82},
  number = {0},
  pages = {21 - 45},
  year = {2012},
  note = {},
  issn = {0169-023X},
  doi = {http://dx.doi.org/10.1016/j.datak.2012.07.003},
  url = {http://www.sciencedirect.com/science/article/pii/S0169023X12000699},
  author = {Boris A. Galitsky and Josep Lluis de la Rosa and Gábor Dobrocsi},
  keywords = {Machine learning},
  keywords = {Constituency parse tree},
  keywords = {Search re-ranking },
  abstract = {We extend the mechanism of logical generalization toward syntactic parse trees and attempt to detect semantic signals unobservable at the level of keywords. Generalization from a syntactic parse tree as a measure of syntactic similarity is defined by the obtained set of maximum common sub-trees and is performed at the level of paragraphs, sentences, phrases and individual words. We analyze the semantic features of this similarity measure and compare it with the semantics of traditional anti-unification of terms. Nearest-Neighbor machine learning is then applied to relate the sentence to a semantic class. By using a syntactic parse tree-based similarity measure instead of the bag-of-words and keyword frequency approaches, we expect to detect a subtle difference between semantic classes that is otherwise unobservable. The proposed approach is evaluated in three distinct domains in which a lack of semantic information makes the classification of sentences rather difficult. We conclude that implicit indications of semantic classes can be extracted from syntactic structures. }
}
@article{Liu2012343,
  title = {\{NORM\} situation in non-uranium mining in China },
  journal = {Annals of the \{ICRP\} },
  volume = {41},
  number = {3–4},
  pages = {343 - 351},
  year = {2012},
  note = {Proceedings of the First \{ICRP\} Symposium on the International System of Radiological Protection },
  issn = {0146-6453},
  doi = {http://dx.doi.org/10.1016/j.icrp.2012.06.015},
  url = {http://www.sciencedirect.com/science/article/pii/S0146645312000309},
  author = {H. Liu and Z. Pan},
  keywords = {Radiation exposure},
  keywords = {Regulatory control},
  keywords = {\{NORM\} },
  abstract = {The Ministry of Environmental Protection in China is responsible for regulatory control on radiation protection from naturally occurring radioactive material (NORM). The natural radiation caused by human activities is a major contributor to public and occupational exposure in China. This paper introduces the first national census on pollution sources (target year 2007) in China, and describes \{NORM\} sites in Baotou, Inner Mongolia, one of the largest rare earth deposits in China. The ores are rich in radioactive elements, with a concentration of ThO2 of 0.01–0.05% and concentration of \{U3O8\} of 0.0005–0.002%. The large amount of \{NORM\} residues is regulated and controlled. After treatment of the waste water, it is discharged into tailing ponds and then pumped directly to milling plants for re-use. The waste gas after off-dust cleaning is discharged into the environment. A substantial amount of blast furnace iron slag from the waste treatment is transformed into cement, concrete, and bricks, or used directly for highway construction. This raises a serious environmental concern. As a result, environmental radiation monitoring and assessment have been introduced recently. Regulatory control of \{NORM\} is very important in order to take effective measures to lower the dose. }
}
@article{Sullivan2001249,
  title = {Dangers of data mining: The case of calendar effects in stock returns },
  journal = {Journal of Econometrics },
  volume = {105},
  number = {1},
  pages = {249 - 286},
  year = {2001},
  note = {Forecasting and empirical methods in finance and macroeconomics },
  issn = {0304-4076},
  doi = {http://dx.doi.org/10.1016/S0304-4076(01)00077-X},
  url = {http://www.sciencedirect.com/science/article/pii/S030440760100077X},
  author = {Ryan Sullivan and Allan Timmermann and Halbert White},
  keywords = {Data mining},
  keywords = {Market efficiency},
  keywords = {Bootstrap testing},
  keywords = {Calendar effects },
  abstract = {Economics is primarily a non-experimental science. Typically, we cannot generate new data sets on which to test hypotheses independently of the data that may have led to a particular theory. The common practice of using the same data set to formulate and test hypotheses introduces data-mining biases that, if not accounted for, invalidate the assumptions underlying classical statistical inference. A striking example of a data-driven discovery is the presence of calendar effects in stock returns. There appears to be very substantial evidence of systematic abnormal stock returns related to the day of the week, the week of the month, the month of the year, the turn of the month, holidays, and so forth. However, this evidence has largely been considered without accounting for the intensive search preceding it. In this paper we use 100 years of daily data and a new bootstrap procedure that allows us to explicitly measure the distortions in statistical inference induced by data mining. We find that although nominal p-values for individual calendar rules are extremely significant, once evaluated in the context of the full universe from which such rules were drawn, calendar effects no longer remain significant. }
}
@article{Zhang20102663,
  title = {Integrating induction and deduction for noisy data mining },
  journal = {Information Sciences },
  volume = {180},
  number = {14},
  pages = {2663 - 2673},
  year = {2010},
  note = {Including Special Section on Hybrid Intelligent Algorithms and Applications },
  issn = {0020-0255},
  doi = {http://dx.doi.org/10.1016/j.ins.2009.11.045},
  url = {http://www.sciencedirect.com/science/article/pii/S0020025509005234},
  author = {Yan Zhang and Xindong Wu},
  keywords = {Noise handling},
  keywords = {Induction},
  keywords = {Deduction},
  keywords = {Error correction },
  abstract = {Data mining research has been drawing a lot of interest and attention from various fields since the late 1980s. Rapid progress has been achieved in three respects: the prosperity of data mining conferences, the significant number of data mining algorithms, and the wide range of areas in which data mining techniques are applied. With the continuing growth of data volumes in many domains, the need to employ data mining techniques provides not only new opportunities but also immense challenges. In this article, we present our study on a challenging topic: integrating induction and deduction for noisy data mining. In particular, we assume the mechanism that corrupts the input data is a set of structured knowledge in the form of Associative Corruption (AC) rules. We apply deductive reasoning to generate the noise corruption rules; make error corrections on the input data with the help of these rules; and perform inductive learning from the corrected input data. Our experimental results show that the proposed integration framework is effective. }
}
@article{Zhu20124222,
  title = {Topic correlation and individual influence analysis in online forums },
  journal = {Expert Systems with Applications },
  volume = {39},
  number = {4},
  pages = {4222 - 4232},
  year = {2012},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2011.09.112},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417411014357},
  author = {Tian Zhu and Bai Wang and Bin Wu and Chuanxi Zhu},
  keywords = {Online network},
  keywords = {Forum},
  keywords = {Topic correlation},
  keywords = {Social influence },
  abstract = {Over the last few years, online forums have gained massive popularity and have become one of the most influential web social media of our times. The forum document corpus can be seen as composed of various topics evolving over time, with every topic reflected in a volume of keywords and social actors. In this paper, we study an interesting problem: are the evolving topics correlated with one another? We propose a method for discovering the dependency relationships between the topics of documents at adjacent time stamps, based on content semantic similarity and the social interactions of authors and repliers. We introduce a mutual information measure to estimate the correlation between topics. Applied to real forum data, we show how topics are related and which postings can be recommended as similar topics. We also show how authors influence topics and propose a new way of evaluating author impact. }
}
@article{Liu201247,
  title = {Mining frequent patterns from univariate uncertain data },
  journal = {Data & Knowledge Engineering },
  volume = {71},
  number = {1},
  pages = {47 - 68},
  year = {2012},
  note = {},
  issn = {0169-023X},
  doi = {http://dx.doi.org/10.1016/j.datak.2011.07.009},
  url = {http://www.sciencedirect.com/science/article/pii/S0169023X11001066},
  author = {Ying-Ho Liu},
  keywords = {Univariate uncertain data},
  keywords = {U2P-Miner},
  keywords = {U2P-tree},
  keywords = {Frequent \{U2\} pattern },
  abstract = {In this paper, we propose a new algorithm called U2P-Miner for mining frequent \{U2\} patterns from univariate uncertain data, where each attribute in a transaction is associated with a quantitative interval and a probability density function. The algorithm is implemented in two phases. First, we construct a U2P-tree that compresses the information in the target database. Then, we use the U2P-tree to discover frequent \{U2\} patterns. Potential frequent \{U2\} patterns are derived by combining base intervals and verified by traversing the U2P-tree. We also develop two techniques to speed up the mining process. Since the proposed method is based on a tree-traversing strategy, it is both efficient and scalable. Our experimental results demonstrate that the U2P-Miner algorithm outperforms three widely used algorithms, namely, the modified Apriori, modified H-mine, and modified depth-first backtracking algorithms. }
}
@article{Coulet2012825,
  title = {The state of the art in text mining and natural language processing for pharmacogenomics },
  journal = {Journal of Biomedical Informatics },
  volume = {45},
  number = {5},
  pages = {825 - 826},
  year = {2012},
  note = {Text Mining and Natural Language Processing in Pharmacogenomics },
  issn = {1532-0464},
  doi = {http://dx.doi.org/10.1016/j.jbi.2012.08.001},
  url = {http://www.sciencedirect.com/science/article/pii/S1532046412001232},
  author = {Adrien Coulet and K. Bretonnel Cohen and Russ B. Altman}
}
@article{Dařena2014103,
  title = {Clients’ Freely Written Assessment as the Source of Automatically Mined Opinions },
  journal = {Procedia Economics and Finance },
  volume = {12},
  number = {0},
  pages = {103 - 110},
  year = {2014},
  note = {17th International Conference Enterprise and Competitive Environment 2014 },
  issn = {2212-5671},
  doi = {http://dx.doi.org/10.1016/S2212-5671(14)00325-6},
  url = {http://www.sciencedirect.com/science/article/pii/S2212567114003256},
  author = {František Dařena and Jan Žižka and Jan Přichystal},
  keywords = {Medical service evaluation},
  keywords = {opinion discovery},
  keywords = {text mining},
  keywords = {machine learning},
  keywords = {clustering },
  abstract = {In measuring the quality of products or services, a challenging task is to reveal clients’ satisfaction or sentiment. As people have many opportunities to express their opinions through various on-line channels (e.g., discussions, microblogs, social networks), the question is whether such data might be used for this purpose. Information hidden in the data includes the reasons why people perceive products or services as good or bad, the reasons for clients’ satisfaction or dissatisfaction, and what affects their sentiment. However, given the large amounts of data needed, it is hardly possible to process them manually. This paper presents a method that aims at the automatic discovery of the sources of human feelings hidden in the textual messages that clients produce. For demonstration, messages in the form of freely written reviews containing subjective evaluations of medical services were used. During analysis of the data, clusters representing groups of whole reviews (or individual sentences) with a certain requested degree of similarity were created in an unsupervised manner. Then, a decision tree classifier was trained in order to find attributes (words) of the reviews that were significant for assigning the reviews to the clusters. Because individual words were sometimes not informative enough, they were subsequently used as a starting point for searching for frequent multi-word expressions. As a result, a list of multi-word phrases representing frequent and important sources of clients’ opinions was produced. }
}
@article{Ellouze201246,
  title = {CITOM: An incremental construction of multilingual topic maps },
  journal = {Data & Knowledge Engineering },
  volume = {74},
  number = {0},
  pages = {46 - 62},
  year = {2012},
  note = {Applications of Natural Language to Information Systems },
  issn = {0169-023X},
  doi = {http://dx.doi.org/10.1016/j.datak.2012.02.002},
  url = {http://www.sciencedirect.com/science/article/pii/S0169023X12000201},
  author = {Nebrasse Ellouze and Nadira Lammari and Elisabeth Métais},
  keywords = {Topic Map (TM)},
  keywords = {Incremental construction},
  keywords = {Multilingual documents},
  keywords = {Information retrieval},
  keywords = {Thesaurus },
  abstract = {This paper proposes the \{CITOM\} approach for the incremental construction of multilingual Topic Maps. Our main goal is to facilitate users' navigation across documents available in different languages. Our approach takes into account three types of information sources: (a) a set of multilingual documents, (b) a domain thesaurus and (c) all the possible questioning sources, such as \{FAQs\} and users' or experts' requests about documents. In this paper we present the different steps of the proposed approach to construct the Topic Map, along with the pruning process for the generated Topic Map. We validate our approach with a real corpus from the sustainable construction domain. }
}
@article{Kostoff2002163,
  title = {Electrochemical power text mining using bibliometrics and database tomography },
  journal = {Journal of Power Sources },
  volume = {110},
  number = {1},
  pages = {163 - 176},
  year = {2002},
  note = {},
  issn = {0378-7753},
  doi = {http://dx.doi.org/10.1016/S0378-7753(02)00233-1},
  url = {http://www.sciencedirect.com/science/article/pii/S0378775302002331},
  author = {Ronald N Kostoff and Rene Tshiteya and Kirstin M Pfeil and James A Humenik},
  keywords = {Electrochemical power},
  keywords = {Database tomography},
  keywords = {Bibliometric analysis},
  keywords = {Text mining},
  keywords = {Information retrieval},
  keywords = {Technical intelligence },
  abstract = {Database tomography (DT) is a textual database analysis system consisting of two major components: (1) algorithms for extracting multi-word phrase frequencies and phrase proximities (physical closeness of the multi-word technical phrases) from any type of large textual database, to augment (2) interpretative capabilities of the expert human analyst. \{DT\} was used to derive technical intelligence from an electrochemical power database derived from the science citation index (SCI). Phrase frequency analysis by the technical domain experts provided the pervasive technical themes of the electrochemical power database, and the phrase proximity analysis provided the relationships among the pervasive technical themes. Bibliometric analysis of the electrochemical power literature supplemented the \{DT\} results with author/journal/institution publication and citation data. }
}
@article{Ferscha2001157,
  title = {Distributed simulation performance data mining },
  journal = {Future Generation Computer Systems },
  volume = {18},
  number = {1},
  pages = {157 - 174},
  year = {2001},
  note = {I. High Performance Numerical Methods and Applications. II. Performance Data Mining: Automated Diagnosis, Adaption, and Optimization },
  issn = {0167-739X},
  doi = {http://dx.doi.org/10.1016/S0167-739X(01)00050-4},
  url = {http://www.sciencedirect.com/science/article/pii/S0167739X01000504},
  author = {Alois Ferscha and James Johnson and Stephen J. Turner},
  keywords = {Distributed simulation (DS)},
  keywords = {Performance data mining},
  keywords = {Time Warp (TW)},
  keywords = {Chandy/Misra/Bryant (CMB) },
  abstract = {The performance of logical process based distributed simulation (DS) protocols like Time Warp and Chandy/Misra/Bryant is influenced by a variety of factors such as the event structure underlying the simulation model, the partitioning into submodels, the performance characteristics of the execution platform, the implementation of the simulation engine and optimizations related to the protocols. The mutual performance effects of parameters exhibit a prohibitively complex degree of interweaving, giving analytical performance investigations only relative relevance. Nevertheless, performance analysis is of utmost practical interest for the simulationist who wants to decide on the suitability of a certain \{DS\} protocol for a specific simulation model before substantial efforts are invested in developing sophisticated \{DS\} codes. Since \{DS\} performance prediction based on analytical models appears doubtful with respect to adequacy and accuracy, this work presents a prediction method based on the simulated execution of skeletal implementations of \{DS\} protocols. Performance data mining methods based on statistical analysis and a simulation tool for \{DS\} protocols have been developed for \{DS\} performance prediction, supporting the simulationist in three types of decision problems: (i) given a simulation problem and parallel execution platform, which \{DS\} protocol promises best performance, (ii) given a simulation model and a \{DS\} strategy, which execution platform is appropriate from the performance viewpoint, and (iii) what class of simulation models is best executed on a given multiprocessor using a certain \{DS\} protocol. Methodologically, skeletons of the most important variations of \{DS\} protocols are developed and executed in the N-MAP performance prediction environment. As a mining technique, performance data is collected and analyzed based on a full factorial design. The design predictor variables are used to explain \{DS\} performance. }
}
@article{Daud2012154,
  title = {Using time topic modeling for semantics-based dynamic research interest finding },
  journal = {Knowledge-Based Systems },
  volume = {26},
  number = {0},
  pages = {154 - 163},
  year = {2012},
  note = {},
  issn = {0950-7051},
  doi = {http://dx.doi.org/10.1016/j.knosys.2011.07.015},
  url = {http://www.sciencedirect.com/science/article/pii/S0950705111001638},
  author = {Ali Daud},
  keywords = {Dynamic research interests},
  keywords = {Exchangeability of topics},
  keywords = {Time topic modeling},
  keywords = {Social networks},
  keywords = {Unsupervised machine learning },
  abstract = {Finding researchers' interests has been an active area of investigation for different recommendation tasks. Previous approaches for finding researchers' interests exploit writing styles and link connectivity while considering the time of documents, but the semantics-based intrinsic structure of words is ignored. Consequently, a topic model named the Author-Topic model was proposed, which exploits the semantics-based intrinsic structure of words shared between the authors of research papers. It ignores simultaneous modeling of the time factor, which results in the exchangeability-of-topics problem, an important factor to deal with when finding dynamic research interests. For example, in many real-world applications, such as finding reviewers for papers and finding taggers in social tagging systems, one needs to consider different time periods. In this paper, we present a time topic modeling approach named Temporal-Author-Topic (TAT), which can simultaneously model the text, researchers and timestamps of research papers to overcome the exchangeability-of-topics problem. The mixture distribution over topics is influenced by both the co-occurrences of words and the timestamps of the research papers. Consequently, the occurrence of topics and their related researchers change over time, while the meaning of a particular topic remains almost unchanged. The proposed approach is used to discover topically related researchers for different time periods. We also show how their interests and relationships change over time. Empirical results on a large corpus of research papers show the effectiveness of our proposed approach and its dominance over the Author-Topic (AT) model in handling the exchangeability-of-topics problem, which enables it to preserve a similar meaning for a particular topic over time. }
}
@article{Ho20129054,
  title = {Using a fuzzy association rule mining approach to identify the financial data association },
  journal = {Expert Systems with Applications },
  volume = {39},
  number = {10},
  pages = {9054 - 9063},
  year = {2012},
  note = {},
  issn = {0957-4174},
  doi = {http://dx.doi.org/10.1016/j.eswa.2012.02.047},
  url = {http://www.sciencedirect.com/science/article/pii/S0957417412002916},
  author = {G.T.S. Ho and W.H. Ip and C.H. Wu and Y.K. Tse},
  keywords = {Hang Seng Index},
  keywords = {Financial data association},
  keywords = {Fuzzy association rule},
  keywords = {Fuzzy set theory },
  abstract = {In the rapidly changing financial market, investors always have difficulty in deciding the right time to trade. In order to enhance investment profitability, investors desire a decision support system. The proposed artificial intelligence methodology provides investors with the ability to learn the associations among different parameters. After the associations are extracted, investors can apply the rules in their decision support systems. In this work, the model is built with the ultimate goal of predicting the level of the Hang Seng Index in Hong Kong. The movement of the Hang Seng Index, which is associated with other economic indices including the gross domestic product (GDP) index, the consumer price index (CPI), the interest rate, and the export value of goods from Hong Kong, is learnt by the proposed method. The case study shows that the proposed method is a feasible way to provide decision support for investors who may not be able to identify the hidden rules between the Hang Seng Index and other economic indices. }
}
@article{Mack2002S89,
  title = {Text-based knowledge discovery: search and mining of life-sciences documents },
  journal = {Drug Discovery Today },
  volume = {7},
  number = {11},
  pages = {S89 - S98},
  year = {2002},
  note = {},
  issn = {1359-6446},
  doi = {http://dx.doi.org/10.1016/S1359-6446(02)02286-9},
  url = {http://www.sciencedirect.com/science/article/pii/S1359644602022869},
  author = {Robert Mack and Michael Hehenberger},
  keywords = {information retrieval},
  keywords = {information extraction},
  keywords = {text mining},
  keywords = {knowledge discovery},
  keywords = {biomedical ontologies },
  abstract = {Text literature is playing an increasingly important role in biomedical discovery. The challenge is to manage the increasing volume, complexity and specialization of knowledge expressed in this literature. Although information retrieval or text searching is useful, it is not sufficient to find specific facts and relations. Information extraction methods are evolving to extract automatically specific, fine-grained terms corresponding to the names of entities referred to in the text, and the relationships that connect these terms. Information extraction is, in turn, a means to an end, and knowledge discovery methods are evolving for the discovery of still more-complex structures and connections among facts. These methods provide an interpretive context for understanding the meaning of biological data. }
}
@article{Clementini2000251,
  title = {Mining multiple-level spatial association rules for objects with a broad boundary },
  journal = {Data & Knowledge Engineering },
  volume = {34},
  number = {3},
  pages = {251 - 270},
  year = {2000},
  note = {},
  issn = {0169-023X},
  doi = {http://dx.doi.org/10.1016/S0169-023X(00)00017-3},
  url = {http://www.sciencedirect.com/science/article/pii/S0169023X00000173},
  author = {Eliseo Clementini and Paolino Di Felice and Krzysztof Koperski},
  keywords = {Association rule},
  keywords = {Data mining},
  keywords = {Spatial database},
  keywords = {Topological relation},
  keywords = {Uncertainty },
  abstract = {Spatial data mining, i.e., mining knowledge from large amounts of spatial data, is a demanding field since huge amounts of spatial data have been collected in various applications, ranging from remote sensing to geographical information systems (GIS), computer cartography, environmental assessment and planning. The collected data far exceeds people's ability to analyze it. Thus, new and efficient methods are needed to discover knowledge from large spatial databases. Most of the spatial data mining methods do not take into account the uncertainty of spatial information. In our work we use objects with broad boundaries, the concept that absorbs all the uncertainty by which spatial data is commonly affected and allows computations in the presence of uncertainty without rough simplifications of the reality. The topological relations between objects with a broad boundary can be organized into a three-level concept hierarchy. We developed and implemented a method for an efficient determination of such topological relations. Based on the hierarchy of topological relations we present a method for mining spatial association rules for objects with uncertainty. The progressive refinement approach is used for the optimization of the mining process. }
}
@article{Hong200155,
  title = {Advances in predictive models for data mining },
  journal = {Pattern Recognition Letters },
  volume = {22},
  number = {1},
  pages = {55 - 61},
  year = {2001},
  note = {Machine Learning and Data Mining in Pattern Recognition },
  issn = {0167-8655},
  doi = {http://dx.doi.org/10.1016/S0167-8655(00)00099-4},
  url = {http://www.sciencedirect.com/science/article/pii/S0167865500000994},
  author = {Se June Hong and Sholom M Weiss},
  keywords = {Data mining},
  keywords = {Text mining},
  keywords = {Machine learning},
  keywords = {Boosting },
  abstract = {Expanding application demand for data mining of massive data warehouses has fueled advances in automated predictive methods. We examine a few successful application areas and their technical challenges. We review the key theoretical developments in \{PAC\} and statistical learning theory that have led to the development of support vector machines and to the use of multiple models for increased predictive accuracy. }
}
@article{MartínezdePisón201222,
  title = {Mining association rules from time series to explain failures in a hot-dip galvanizing steel line },
  journal = {Computers & Industrial Engineering },
  volume = {63},
  number = {1},
  pages = {22 - 36},
  year = {2012},
  note = {},
  issn = {0360-8352},
  doi = {http://dx.doi.org/10.1016/j.cie.2012.01.013},
  url = {http://www.sciencedirect.com/science/article/pii/S0360835212000253},
  author = {Francisco Javier Martínez-de-Pisón and Andrés Sanz and Eduardo Martínez-de-Pisón and Emilio Jiménez and Dante Conti},
  keywords = {Cause failures},
  keywords = {Association rules},
  keywords = {Knowledge discovery},
  keywords = {Multiple time series},
  keywords = {Continuous hot-dip galvanized line },
  abstract = {This paper presents an experience based on the use of association rules from multiple time series captured from industrial processes. The main goal is to seek useful knowledge for explaining failures in these processes. An overall method is developed to obtain association rules that represent the repeated relationships between pre-defined episodes in multiple time series, using a time window and a time lag. First, the process involves working in an iterative and interactive manner with several pre-processing and segmentation algorithms for each kind of time series in order to obtain significant events. In the next step, a search is made for sequences of events called episodes that are repeated among the various time series according to a pre-set consequent, a pre-established time window and a time lag. Extraction is then made of the association rules for those episodes that appear many times and have a high rate of hits. Finally, a case study is described regarding the application of this methodology to a historical database of 150 variables from an industrial process for galvanizing steel coils. }
}
@article{Lin2001189,
  title = {A data mining approach to the prediction of corporate failure },
  journal = {Knowledge-Based Systems },
  volume = {14},
  number = {3–4},
  pages = {189 - 195},
  year = {2001},
  note = {},
  issn = {0950-7051},
  doi = {http://dx.doi.org/10.1016/S0950-7051(01)00096-X},
  url = {http://www.sciencedirect.com/science/article/pii/S095070510100096X},
  author = {Feng Yu Lin and Sally McClean},
  keywords = {Corporate failure},
  keywords = {Data mining},
  keywords = {Hybrid method },
  abstract = {This paper uses a data mining approach to the prediction of corporate failure. Initially, we use four single classifiers — discriminant analysis, logistic regression, neural networks and C5.0 — each based on two feature selection methods for predicting corporate failure. Of the two feature selection methods — human judgement based on financial theory and \{ANOVA\} statistical method — we found the \{ANOVA\} method performs better than the human judgement method in all classifiers except discriminant analysis. Among the individual classifiers, decision trees and neural networks were found to provide better results. Finally, a hybrid method that combines the best features of several classification models is developed to increase the prediction performance. The empirical tests show that such a hybrid method produces higher prediction accuracy than individual classifiers. }
}
@incollection{Wu201227,
  title = {2 - Integrative Mining of Traditional Chinese Medicine Literature and \{MEDLINE\} for Functional Gene Networks },
  editor = {Zhaohui Wu and Huajun Chen and Xiaohong Jiang},
  booktitle = {Modern Computational Approaches to Traditional Chinese Medicine },
  publisher = {Elsevier},
  edition = {},
  address = {Oxford},
  year = {2012},
  pages = {27 - 52},
  isbn = {978-0-12-398510-1},
  doi = {http://dx.doi.org/10.1016/B978-0-12-398510-1.00002-9},
  url = {http://www.sciencedirect.com/science/article/pii/B9780123985101000029},
  author = {Zhaohui Wu and Huajun Chen and Xiaohong Jiang},
  keywords = {System biology},
  keywords = {traditional Chinese medicine},
  keywords = {literature mining},
  keywords = {gene network},
  keywords = {syndrome perspective},
  keywords = {gene functional analysis},
  keywords = {scientific hypotheses },
  abstract = {The amount of biomedical data in different disciplines is growing at an exponential rate. Integrating these significant knowledge sources to generate novel hypotheses for systems biology research is difficult. Compared to modern biomedical science, traditional Chinese medicine (TCM) is a complementary knowledge system and a completely different discipline. This chapter uses a significant \{TCM\} bibliographic literature database in China, together with MEDLINE, to help discover novel gene functional knowledge. We present an integrative mining approach to uncovering the functional gene relationships from \{MEDLINE\} and \{TCM\} bibliographic literature. This chapter introduces \{TCM\} literature (about 50,000 records) as one knowledge source for constructing literature-based gene networks. We use the \{TCM\} diagnosis and \{TCM\} syndrome to automatically congregate the related genes. The syndrome–gene relationships are discovered based on the syndrome–disease relationships extracted from \{TCM\} literature and the disease–gene relationships in MEDLINE. Based on the bubble-bootstrapping and relation weight computing methods, we have developed a prototype system called MeDisco/3S, which has named entity and relation extraction, and online analytical processing (OLAP) capabilities, to perform the integrative mining process. We have obtained about 200,000 syndrome–gene relations, which could help generate syndrome-based gene networks and help analyze the functional knowledge of genes from the syndrome perspective. We take the gene network of Kidney Yang Deficiency (KYD) syndrome and the functional analysis of some genes, such as \{CRH\} (corticotropin-releasing hormone), \{PTH\} (parathyroid hormone), \{PRL\} (prolactin), \{BRCA1\} (breast cancer 1, early onset), and \{BRCA2\} (breast cancer 2, early onset) to demonstrate the preliminary results. The underlying hypothesis is that the related genes of the same syndrome will have some biological functional relationships and will constitute a functional network. This chapter presents an approach to integrating \{TCM\} literature and modern biomedical data to discover novel gene networks and functional knowledge of genes. The preliminary results show that the novel gene functional knowledge and gene networks, which are worthy of further investigation, could be generated by integrating the two complementary biomedical data sources. It will be a promising research field through integrative mining of \{TCM\} and modern life science literature. }
}
@article{He2012359,
  title = {Mining patterns of author orders in scientific publications },
  journal = {Journal of Informetrics },
  volume = {6},
  number = {3},
  pages = {359 - 367},
  year = {2012},
  note = {},
  issn = {1751-1577},
  doi = {http://dx.doi.org/10.1016/j.joi.2012.01.001},
  url = {http://www.sciencedirect.com/science/article/pii/S1751157712000028},
  author = {Bing He and Ying Ding and Erjia Yan},
  keywords = {Author orders},
  keywords = {Author sequence},
  keywords = {Scientific collaboration },
  abstract = {The author order of multi-authored papers can reveal subtle patterns of scientific collaboration and provide insights on the nature of credit assignment among coauthors. This article proposes a sequence-based perspective on scientific collaboration. Using frequently occurring sequences as the unit of analysis, this study explores (1) what types of sequence patterns are most common in the scientific collaboration at the level of authors, institutions, U.S. states, and nations in Library and Information Science (LIS); and (2) the productivity (measured by number of papers) and influence (measured by citation counts) of different types of sequence patterns. Results show that (1) the productivity and influence approximately follow the power law for frequent sequences in the four levels of analysis; (2) the productivity and influence present a significant positive correlation among frequent sequences, and the strength of the correlation increases with the level of integration; (3) for author-level, institution-level, and state-level frequent sequences, short geographical distances between the authors usually co-present with high productivities, while long distances tend to co-occur with large citation counts; (4) for author-level frequent sequences, the pattern of “the more productive and prestigious authors ranking ahead” is the one with the highest productivity and the highest influence; however, in the rest of the levels of analysis, the pattern with the highest productivity and the highest influence is the one with “the less productive and prestigious institutions/states/nations ranking ahead.” }
}
@article{Hong1999363,
  title = {Mining association rules from quantitative data },
  journal = {Intelligent Data Analysis },
  volume = {3},
  number = {5},
  pages = {363 - 376},
  year = {1999},
  note = {},
  issn = {1088-467X},
  doi = {http://dx.doi.org/10.1016/S1088-467X(99)00028-1},
  url = {http://www.sciencedirect.com/science/article/pii/S1088467X99000281},
  author = {Tzung-Pei Hong and Chan-Sheng Kuo and Sheng-Chai Chi},
  keywords = {Data-mining},
  keywords = {Fuzzy set},
  keywords = {Association rule},
  keywords = {Transaction},
  keywords = {Quantitative value },
  abstract = {Data-mining is the process of extracting desirable knowledge or interesting patterns from existing databases for specific purposes. Most conventional data-mining algorithms identify the relationships among transactions using binary values; however, transactions with quantitative values are commonly seen in real-world applications. This paper thus proposes a new data-mining algorithm for extracting interesting knowledge from transactions stored as quantitative values. The proposed algorithm integrates fuzzy set concepts and the Apriori mining algorithm to find interesting fuzzy association rules in given transaction data sets. Experiments with student grades at I-Shou University were also conducted to verify the performance of the proposed algorithm. }
}
@article{Marks201222,
  title = {Mining the next big thing },
  journal = {New Scientist },
  volume = {214},
  number = {2871},
  pages = {22},
  year = {2012},
  note = {},
  issn = {0262-4079},
  doi = {http://dx.doi.org/10.1016/S0262-4079(12)61672-8},
  url = {http://www.sciencedirect.com/science/article/pii/S0262407912616728},
  author = {Paul Marks},
  abstract = {Patent citations could help predict where technology is headed }
}
@incollection{Wu2012171,
  title = {10 - Semantic Association Mining for Traditional Chinese Medicine },
  editor = {Zhaohui Wu and Huajun Chen and Xiaohong Jiang},
  booktitle = {Modern Computational Approaches to Traditional Chinese Medicine },
  publisher = {Elsevier},
  edition = {},
  address = {Oxford},
  year = {2012},
  pages = {171 - 198},
  isbn = {978-0-12-398510-1},
  doi = {http://dx.doi.org/10.1016/B978-0-12-398510-1.00010-8},
  url = {http://www.sciencedirect.com/science/article/pii/B9780123985101000108},
  author = {Zhaohui Wu and Huajun Chen and Xiaohong Jiang},
  keywords = {Semantic Association Mining},
  keywords = {domain ontology},
  keywords = {traditional Chinese medicine},
  keywords = {hypothesis driven},
  keywords = {graph network },
  abstract = {Domain-driven data mining (D3M) seeks to “meta-synthesize” a variety of intelligence resources for actionable knowledge discovery in complex domain problems. We present an application of \{D3M\} in Integrated Medicine, an interdisciplinary subject that explores and interprets the knowledge assets of traditional Chinese medicine (TCM) by scientific methods in order to discover novel medical knowledge. We propose an ontology-based technical framework for hypothesis-driven Semantic Association Mining (SAM), which allows a knowledge network to emerge through the communication of Semantic Associations (SAs) by a multitude of agents in terms of hypotheses and evidence. Agents extract patterns from their own semantic graphs driven by the need to solve published hypotheses. Agents can publish discovered patterns as partial evidence that depends on some other hypotheses. We establish a multiagent environment to simulate the situation of the Semantic Web and evaluate the feasibility and scalability of hypothesis-driven \{SAM\} through multiagent collaboration. Through this mechanism, we utilize a shared domain ontology to synthesize complementary knowledge elements from both orthodox and traditional medicine, in order to support collaborative knowledge discovery and validation. The simulation shows that the framework can scale up to a large number of agents and ensure the quality of discovered SAs. We demonstrate a case study that discovers and integrates relationships and interactions centered on \{TCM\} herbs from distributed data sources. The resulting “HerbNet,” instead of a static dataset, is essentially an open and evolving knowledge network that is capable of answering complex domain problems and supporting hypothesis-driven knowledge discovery tasks. }
}
@article{Şah201241,
  title = {Automatic metadata mining from multilingual enterprise content },
  journal = {Web Semantics: Science, Services and Agents on the World Wide Web },
  volume = {11},
  number = {0},
  pages = {41 - 62},
  year = {2012},
  note = {},
  issn = {1570-8268},
  doi = {http://dx.doi.org/10.1016/j.websem.2011.11.001},
  url = {http://www.sciencedirect.com/science/article/pii/S1570826811000801},
  author = {Melike Şah and Vincent Wade},
  keywords = {Automatic metadata generation},
  keywords = {Ontologies},
  keywords = {Personalization},
  keywords = {Fuzzy information granulation and fuzzy inference },
  abstract = {Personalization is increasingly vital, especially for enterprises that want to reach their customers. The key challenge in supporting personalization is the need for rich metadata, such as metadata about structural relationships, subject/concept relations between documents and cognitive metadata about documents (e.g. the difficulty of a document). Manual annotation of large knowledge bases with such rich metadata is not scalable. Moreover, automatic mining of cognitive metadata is challenging, since it is very difficult to automatically understand the underlying intellectual knowledge of a document. At the same time, Web content is increasingly becoming multilingual, since a growing amount of the data generated on the Web is non-English. Current metadata extraction systems are generally based on English content, and this needs to be revolutionized in order to adapt to the changing dynamics of the Web. To alleviate these problems, we introduce a novel automatic metadata extraction framework, which is based on a novel fuzzy-based method for automatic cognitive metadata generation and uses different document parsing algorithms to extract rich metadata from multilingual enterprise content using the newly developed DocBook, Resource Type and Topic ontologies. Since the metadata generation process is based upon DocBook-structured enterprise content, our framework is focused on enterprise documents and content which is loosely based on the DocBook type of formatting. DocBook is a common documentation format used to formally produce corporate data, and it is adopted by many enterprises. The proposed framework is illustrated and evaluated on the English, German and French versions of the Symantec Norton 360 knowledge bases. The user study showed that the proposed fuzzy-based method generates reasonably accurate values, with an average precision of 89.39% on the metadata values of document difficulty, document interactivity level and document interactivity type. The proposed fuzzy inference system achieves improved results compared to a rule-based reasoner for difficulty metadata extraction (∼11% enhancement). In addition, user-perceived metadata quality scores (mean of 5.57 out of 6) were found to be high, and automated metadata analysis showed that the extracted metadata is of high quality and suitable for personalized information retrieval. }
}
@article{Kunz2013362,
  title = {Implementing an integrated approach to water management by matching problem complexity with management responses: a case study of a mine site water committee },
  journal = {Journal of Cleaner Production },
  volume = {52},
  number = {0},
  pages = {362 - 373},
  year = {2013},
  note = {},
  issn = {0959-6526},
  doi = {http://dx.doi.org/10.1016/j.jclepro.2013.03.018},
  url = {http://www.sciencedirect.com/science/article/pii/S0959652613001522},
  author = {N.C. Kunz and C.J. Moran and T. Kastelle},
  keywords = {Water governance},
  keywords = {Water management},
  keywords = {Water resources},
  keywords = {Complex system},
  keywords = {Social network analysis},
  keywords = {Systems approach},
  keywords = {Mining },
  abstract = {An integrated approach is considered important for improving water management across a range of contexts – from water catchments to urban systems to mining sites. Challenges are faced in implementation due to the complexities in both human and physical (or engineered) dimensions. From a physical perspective, water systems are complex because issues are interconnected across spatial and temporal scales making it difficult to determine where to intervene to attain desired outcomes. The supporting human systems are also complex. Whether bounded at the company, regional or catchment scale, water systems are rarely controlled by a single actor or institution. For example, catchments extend across political boundaries, while different organisational departments share responsibilities for water reticulation through industrial sites. Effective water management requires coordinating decisions between diverse actors. This paper makes three contributions. First, from a consolidation of literature, three principles emerge that, if followed, should improve the management of a complex water system: (I) define whether the water issues to be addressed are simpler or more complex in nature; (II) discern whether the required response is more tactical or strategic; and (III) acknowledge the existence of boundaries and use them effectively. Second, a framework is proposed to support implementation of these principles using the mining industry as a test bed. The third contribution is an application at a case study site to examine the utility of the principles and the value of the framework. It is concluded that problems arising in a water system will be managed most effectively when problem complexity is “matched” with an equivalent management response. This practical approach may assist other industrial sites in comprehending the nature of the water issues affecting operations (simple to complex) and to assign the appropriate level of management authority relevant to the decision at hand (tactical to strategic). The framework may also have utility for managing water in a broader context, such as water catchments that lie across political boundaries. This may ultimately improve implementation of overarching concepts such as integrated water resources management (IWRM), assisting the recognised need to progress from theory to practice. }
}
@article{Mørup201254,
  title = {Archetypal analysis for machine learning and data mining },
  journal = {Neurocomputing },
  volume = {80},
  number = {0},
  pages = {54 - 63},
  year = {2012},
  note = {Special Issue on Machine Learning for Signal Processing 2010 },
  issn = {0925-2312},
  doi = {http://dx.doi.org/10.1016/j.neucom.2011.06.033},
  url = {http://www.sciencedirect.com/science/article/pii/S0925231211006060},
  author = {Morten Mørup and Lars Kai Hansen},
  keywords = {Archetypal analysis},
  keywords = {Principal convex hull},
  keywords = {Clustering},
  keywords = {Non-negative matrix factorization},
  keywords = {FurthestFirst},
  keywords = {FurthestSum},
  keywords = {Kernel methods },
  abstract = {Archetypal analysis (AA), proposed by Cutler and Breiman (1994) [7], estimates the principal convex hull (PCH) of a data set. As such, AA favors features that constitute representative ‘corners’ of the data, i.e., distinct aspects or archetypes. We show that AA enjoys the interpretability of clustering, without being limited to hard assignment, and the uniqueness of SVD, without being limited to orthogonal representations. In order to do large-scale AA, we derive an efficient algorithm based on projected gradients, as well as an initialization procedure, FurthestSum, inspired by the FurthestFirst approach widely used for k-means (Hochbaum and Shmoys, 1985 [14]). We generalize the AA procedure to kernel-AA in order to extract the principal convex hull in potentially infinite Hilbert spaces, and derive a relaxation of AA for the case where the archetypes cannot be represented as convex combinations of the observed data. We further demonstrate that the AA model is relevant for feature extraction and dimensionality reduction across a large variety of machine learning problems taken from computer vision, neuroimaging, chemistry, text mining and collaborative filtering, leading to highly interpretable representations of the dynamics in the data. Matlab code for the derived algorithms is available for download from www.mortenmorup.dk. }
}
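
The FurthestSum initialization named in this abstract greedily picks points whose summed distance to the already-selected candidates is maximal (FurthestFirst maximizes the minimum distance instead). Below is a simplified NumPy sketch of that greedy idea; the published procedure additionally discards and re-selects the random starting point, which is omitted here.

    import numpy as np

    def furthest_sum(X, k, seed=0):
        """Pick k well-separated rows of X as archetype candidates."""
        rng = np.random.default_rng(seed)
        chosen = [int(rng.integers(X.shape[0]))]             # random start
        dist_sum = np.linalg.norm(X - X[chosen[0]], axis=1)  # summed distance to chosen set
        for _ in range(k - 1):
            dist_sum[chosen] = -np.inf                       # never re-pick a point
            nxt = int(np.argmax(dist_sum))                   # largest summed distance
            chosen.append(nxt)
            dist_sum = dist_sum + np.linalg.norm(X - X[nxt], axis=1)
        return chosen

    X = np.vstack([np.zeros(2), np.eye(2), [[5.0, 5.0]]])    # four points in 2-D
    print(furthest_sum(X, 2))                                # tends to pick the 'corners'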
@article{Miller2012437,
  title = {Fundamentals of government information: Mining, finding, evaluating, and using government resources, Eric J. Forte, Cassandra J. Hartnett, Andrea L. Sevetson (Eds.). Neal-Schuman Publishers, New York (2011), ISBN: 978-1-55570-737-8 },
  journal = {Government Information Quarterly },
  volume = {29},
  number = {3},
  pages = {437 - 438},
  year = {2012},
  note = {},
  issn = {0740-624X},
  doi = {http://dx.doi.org/10.1016/j.giq.2012.04.001},
  url = {http://www.sciencedirect.com/science/article/pii/S0740624X12000421},
  author = {Barbara Miller}
}
@article{Adhikari2012121,
  title = {Temporal Data Mining, 1st Edition, Theophano Mitsa. Chapman & Hall/\{CRC\} (2010), 373 pp. },
  journal = {Computer Science Review },
  volume = {6},
  number = {2–3},
  pages = {121 - 124},
  year = {2012},
  note = {},
  issn = {1574-0137},
  doi = {http://dx.doi.org/10.1016/j.cosrev.2012.02.001},
  url = {http://www.sciencedirect.com/science/article/pii/S157401371200010X},
  author = {Jhimli Adhikari}
}
@article{Wu2012617,
  title = {Locally discriminative topic modeling },
  journal = {Pattern Recognition },
  volume = {45},
  number = {1},
  pages = {617 - 625},
  year = {2012},
  note = {},
  issn = {0031-3203},
  doi = {http://dx.doi.org/10.1016/j.patcog.2011.04.029},
  url = {http://www.sciencedirect.com/science/article/pii/S0031320311001889},
  author = {Hao Wu and Jiajun Bu and Chun Chen and Jianke Zhu and Lijun Zhang and Haifeng Liu and Can Wang and Deng Cai},
  keywords = {Topic modeling},
  keywords = {Generative},
  keywords = {Discriminative},
  keywords = {Local learning },
  abstract = {Topic modeling is a powerful tool for discovering the underlying or hidden structure in text corpora. Typical algorithms for topic modeling include probabilistic latent semantic analysis (PLSA) and latent Dirichlet allocation (LDA). Despite their different inspirations, both approaches are instances of generative models, in which the discriminative structure of the documents is ignored. In this paper, we propose the locally discriminative topic model (LDTM), a novel topic modeling approach which considers both the generative and the discriminative structure of the data space. Different from \{PLSA\} and \{LDA\}, in which the topic distribution of a document depends on all the other documents, \{LDTM\} takes the local perspective that the topic distribution of each document is strongly dependent on its neighbors. By modeling the local relationships of documents within each neighborhood via a local linear model, we learn topic distributions that vary smoothly along the geodesics of the data manifold and can better capture the discriminative structure in the data. Experimental results on text clustering and web page categorization demonstrate the effectiveness of the proposed approach. }
}
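
LDTM itself is not reproduced here, but its central ingredient, a neighborhood for each document, is easy to sketch. The following hedged illustration builds cosine k-NN neighborhoods over tf-idf vectors with scikit-learn; the local linear models and the actual topic inference are beyond this snippet, and the toy corpus is invented.

    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.neighbors import NearestNeighbors

    docs = ["topic models find hidden structure in text",
            "latent dirichlet allocation is a generative topic model",
            "water management at mining sites",
            "mine site water committees and governance"]

    X = TfidfVectorizer().fit_transform(docs)                 # documents as tf-idf vectors
    nn = NearestNeighbors(n_neighbors=2, metric="cosine").fit(X)
    _, idx = nn.kneighbors(X)   # idx[i] holds document i and its nearest neighbor
    print(idx)                  # the two topic-modeling docs pair up, as do the water docs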
@article{Chen2000527,
  title = {Intelligent materials processing by hyperspace data mining },
  journal = {Engineering Applications of Artificial Intelligence },
  volume = {13},
  number = {5},
  pages = {527 - 532},
  year = {2000},
  note = {},
  issn = {0952-1976},
  doi = {http://dx.doi.org/10.1016/S0952-1976(00)00032-4},
  url = {http://www.sciencedirect.com/science/article/pii/S0952197600000324},
  author = {Nianyi Chen and Dongping Daniel Zhu and Wenhua Wang},
  keywords = {Data mining},
  keywords = {Materials design},
  keywords = {Features},
  keywords = {Optimization },
  abstract = {This paper discusses the application of hyperspace data mining to materials manufacturing. We introduce an innovative hyperspace method whereby data are separated into subspaces, features are selected according to data patterns, and control is rendered in the original feature space. This technique has three major advantages: no equipment is added, no experiments are needed, and no interruption occurs to production. A number of proprietary algorithms have been built into a software product, MasterMiner™, for use in materials design and manufacturing. Examples are given to show the efficacy of the proposed method and the MasterMiner tool. }
}
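
The MasterMiner algorithms are proprietary, so the following is only one generic reading of the "separate into subspaces, then select features" recipe above: cluster the samples, then rank each subspace's process variables by correlation with the outcome. The data and variable roles are made up.

    import numpy as np
    from sklearn.cluster import KMeans

    rng = np.random.default_rng(1)
    X = rng.normal(size=(60, 4))                        # 60 runs, 4 process variables
    y = 2.0 * X[:, 2] + rng.normal(scale=0.1, size=60)  # outcome driven by variable 2

    # Partition the runs into subspaces, then score features per subspace.
    labels = KMeans(n_clusters=3, n_init=10, random_state=0).fit_predict(X)
    for c in range(3):
        mask = labels == c
        corr = [abs(np.corrcoef(X[mask, j], y[mask])[0, 1]) for j in range(4)]
        print(f"subspace {c}: most informative variable = {int(np.argmax(corr))}")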
@article{deHaan2011923,
  title = {Framing flexibility: Theorising and data mining to develop a useful definition of flexibility and related concepts },
  journal = {Futures },
  volume = {43},
  number = {9},
  pages = {923 - 933},
  year = {2011},
  note = {Special Issue: Flexible infrastructures },
  issn = {0016-3287},
  doi = {http://dx.doi.org/10.1016/j.futures.2011.06.002},
  url = {http://www.sciencedirect.com/science/article/pii/S0016328711001364},
  author = {J. de Haan and J.H. Kwakkel and W.E. Walker and J. Spirco and W.A.H. Thissen},
  abstract = {Flexibility is a term used in various fields with widely differing interpretations. Moreover, several related concepts exist, such as adaptability, that overlap with it in meaning or are simply used synonymously. This article presents a framing of flexibility, and of three concepts with which it bears a close family resemblance, for use in the context of infrastructure constellations. The definitions proposed in this frame draw inspiration from the existing literature, though they are not based upon a classical literature review; rather, a usable set of definitions is proposed for the intended context. The definitions all share the same structure, making it easier to appreciate how the concepts are related and how they differ. To verify whether the definitions correspond to their practical use, a data-mining exercise is performed on over 11,000 scientific articles that use the concept of flexibility. After identifying the corpus of articles closest to the intended field of application (infrastructure constellations), a co-occurrence analysis is carried out to clarify the differences between the concepts and to add nuance to the meaning conveyed in the definitions. }
}
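
The co-occurrence analysis mentioned above is simple to sketch: count how often pairs of concept terms appear in the same article. The concept list and three-line corpus below are illustrative stand-ins for the authors' 11,000-article corpus.

    from collections import Counter
    from itertools import combinations

    concepts = {"flexibility", "adaptability", "robustness", "resilience"}
    corpus = ["flexibility and adaptability of infrastructure systems",
              "robustness versus flexibility in planning",
              "resilience and robustness of networks"]

    pair_counts = Counter()
    for article in corpus:
        present = sorted(concepts & set(article.split()))  # concepts in this article
        pair_counts.update(combinations(present, 2))       # all unordered pairs

    for (a, b), n in pair_counts.most_common():
        print(f"{a} / {b}: {n}")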
@article{Zhang2012428,
  title = {Inferring functional miRNA–mRNA regulatory modules in epithelial–mesenchymal transition with a probabilistic topic model },
  journal = {Computers in Biology and Medicine },
  volume = {42},
  number = {4},
  pages = {428 - 437},
  year = {2012},
  note = {},
  issn = {0010-4825},
  doi = {http://dx.doi.org/10.1016/j.compbiomed.2011.12.011},
  url = {http://www.sciencedirect.com/science/article/pii/S0010482511002514},
  author = {Junpeng Zhang and Bing Liu and Jianfeng He and Lei Ma and Jiuyong Li},
  keywords = {miRNA},
  keywords = {mRNA},
  keywords = {Functional regulatory modules},
  keywords = {Epithelial–mesenchymal transition},
  keywords = {Probabilistic topic model },
  abstract = {MicroRNAs (miRNAs) play important roles in gene regulatory networks. In this paper, we propose a probabilistic topic model to infer regulatory networks of miRNAs and their target mRNAs for specific biological conditions at the post-transcriptional level, so-called functional miRNA–mRNA regulatory modules (FMRMs). The probabilistic model used in this paper can effectively capture the relationship between miRNAs and mRNAs under specific cellular conditions. Furthermore, the proposed method identifies negatively and positively correlated miRNA–mRNA pairs associated with the epithelial, mesenchymal and other conditions in the \{EMT\} (epithelial–mesenchymal transition) data set, respectively. Results on the \{EMT\} data sets show that the inferred \{FMRMs\} can potentially construct the biological chain ‘miRNA → mRNA → condition’ at the post-transcriptional level. }
}
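
The screening for negatively and positively correlated miRNA–mRNA pairs can be sketched with plain Pearson correlation over expression profiles; the FMRM topic-model inference itself is not reproduced here. The profiles below are fabricated, and only the gene names nod to the EMT literature.

    import numpy as np

    mirna = {"miR-200": np.array([5.1, 4.8, 1.2, 0.9])}  # high in the epithelial samples
    mrna = {"ZEB1": np.array([0.7, 0.9, 4.5, 5.0]),      # a canonical miR-200 target
            "CDH1": np.array([4.9, 5.2, 1.0, 0.8])}

    for mi, x in mirna.items():
        for gene, y in mrna.items():
            r = np.corrcoef(x, y)[0, 1]                  # Pearson correlation
            sign = "negative" if r < 0 else "positive"
            print(f"{mi} vs {gene}: r = {r:+.2f} ({sign} correlation)")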
@article{Ferscha2001127,
  title = {Performance data mining: Automated diagnosis, adaption, and optimization },
  journal = {Future Generation Computer Systems },
  volume = {18},
  number = {1},
  pages = {127 - 130},
  year = {2001},
  note = {I. High Performance Numerical Methods and Applications. II. Performance Data Mining: Automated Diagnosis, Adaption, and Optimization },
  issn = {0167-739X},
  doi = {http://dx.doi.org/10.1016/S0167-739X(01)00047-4},
  url = {http://www.sciencedirect.com/science/article/pii/S0167739X01000474},
  author = {Alois Ferscha and Allen D Malony},
  keywords = {Performance data mining},
  keywords = {Parallel},
  keywords = {Diagnosis},
  keywords = {Adaption},
  keywords = {Optimization }
}

This file was generated by bibtex2html 1.96.