Repository: incubator-griffin-site Updated Branches: refs/heads/master 6c65212e6 -> c8612f14b
first introduction Project: http://git-wip-us.apache.org/repos/asf/incubator-griffin-site/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-griffin-site/commit/c8612f14 Tree: http://git-wip-us.apache.org/repos/asf/incubator-griffin-site/tree/c8612f14 Diff: http://git-wip-us.apache.org/repos/asf/incubator-griffin-site/diff/c8612f14 Branch: refs/heads/master Commit: c8612f14bed3cab35e150dc8ad9eabd67baff1f1 Parents: 6c65212 Author: William Guo <[email protected]> Authored: Mon Mar 20 13:41:06 2017 -0700 Committer: William Guo <[email protected]> Committed: Mon Mar 20 13:41:06 2017 -0700 ---------------------------------------------------------------------- _config.yml | 4 +- db.json | 2 +- source/_posts/hello-world.md | 79 ++++++++++++++++++++++---------- source/_posts/hello.md | 5 -- source/images/Business_Process.png | Bin 0 -> 33482 bytes 5 files changed, 59 insertions(+), 31 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-griffin-site/blob/c8612f14/_config.yml ---------------------------------------------------------------------- diff --git a/_config.yml b/_config.yml index ca2bd64..ac4b691 100644 --- a/_config.yml +++ b/_config.yml @@ -3,10 +3,10 @@ ## Source: https://github.com/hexojs/hexo/ # Site -title: Hexo +title: Apache Griffin subtitle: description: -author: John Doe +author: William Guo language: timezone: http://git-wip-us.apache.org/repos/asf/incubator-griffin-site/blob/c8612f14/db.json ---------------------------------------------------------------------- diff --git a/db.json b/db.json index d4b1d64..01a5163 100644 --- a/db.json +++ b/db.json @@ -1 +1 @@ -{"meta":{"version":1,"warehouse":"2.2.0"},"models":{"Asset":[],"Cache":[],"Category":[],"Data":[],"Page":[],"Post":[],"PostAsset":[],"PostCategory":[],"PostTag":[],"Tag":[]}} \ No newline at end of file 
+{"meta":{"version":1,"warehouse":"2.2.0"},"models":{"Asset":[{"_id":"themes/landscape/source/css/style.styl","path":"css/style.styl","modified":0,"renderable":1},{"_id":"themes/landscape/source/fancybox/fancybox_loading.gif","path":"fancybox/fancybox_loading.gif","modified":0,"renderable":1},{"_id":"themes/landscape/source/fancybox/blank.gif","path":"fancybox/blank.gif","modified":0,"renderable":1},{"_id":"themes/landscape/source/fancybox/[email protected]","path":"fancybox/[email protected]","modified":0,"renderable":1},{"_id":"themes/landscape/source/fancybox/fancybox_overlay.png","path":"fancybox/fancybox_overlay.png","modified":0,"renderable":1},{"_id":"themes/landscape/source/fancybox/fancybox_sprite.png","path":"fancybox/fancybox_sprite.png","modified":0,"renderable":1},{"_id":"themes/landscape/source/fancybox/[email protected]","path":"fancybox/[email protected]","modified":0,"renderable":1},{"_id":"themes/landscape/source/fancybox/jquery.fancybox.css"," path":"fancybox/jquery.fancybox.css","modified":0,"renderable":1},{"_id":"themes/landscape/source/js/script.js","path":"js/script.js","modified":0,"renderable":1},{"_id":"themes/landscape/source/fancybox/jquery.fancybox.pack.js","path":"fancybox/jquery.fancybox.pack.js","modified":0,"renderable":1},{"_id":"themes/landscape/source/fancybox/jquery.fancybox.js","path":"fancybox/jquery.fancybox.js","modified":0,"renderable":1},{"_id":"themes/landscape/source/css/fonts/fontawesome-webfont.eot","path":"css/fonts/fontawesome-webfont.eot","modified":0,"renderable":1},{"_id":"themes/landscape/source/css/fonts/FontAwesome.otf","path":"css/fonts/FontAwesome.otf","modified":0,"renderable":1},{"_id":"themes/landscape/source/css/fonts/fontawesome-webfont.woff","path":"css/fonts/fontawesome-webfont.woff","modified":0,"renderable":1},{"_id":"themes/landscape/source/fancybox/helpers/fancybox_buttons.png","path":"fancybox/helpers/fancybox_buttons.png","modified":0,"renderable":1},{"_id":"themes/lands 
cape/source/fancybox/helpers/jquery.fancybox-buttons.css","path":"fancybox/helpers/jquery.fancybox-buttons.css","modified":0,"renderable":1},{"_id":"themes/landscape/source/fancybox/helpers/jquery.fancybox-buttons.js","path":"fancybox/helpers/jquery.fancybox-buttons.js","modified":0,"renderable":1},{"_id":"themes/landscape/source/fancybox/helpers/jquery.fancybox-media.js","path":"fancybox/helpers/jquery.fancybox-media.js","modified":0,"renderable":1},{"_id":"themes/landscape/source/fancybox/helpers/jquery.fancybox-thumbs.css","path":"fancybox/helpers/jquery.fancybox-thumbs.css","modified":0,"renderable":1},{"_id":"themes/landscape/source/fancybox/helpers/jquery.fancybox-thumbs.js","path":"fancybox/helpers/jquery.fancybox-thumbs.js","modified":0,"renderable":1},{"_id":"themes/landscape/source/css/fonts/fontawesome-webfont.ttf","path":"css/fonts/fontawesome-webfont.ttf","modified":0,"renderable":1},{"_id":"themes/landscape/source/css/fonts/fontawesome-webfont.svg","path":"css/fonts/fo ntawesome-webfont.svg","modified":0,"renderable":1},{"_id":"themes/landscape/source/css/images/banner.jpg","path":"css/images/banner.jpg","modified":0,"renderable":1},{"_id":"source/images/Business_Process.png","path":"images/Business_Process.png","modified":0,"renderable":0}],"Cache":[{"_id":"themes/landscape/.gitignore","hash":"58d26d4b5f2f94c2d02a4e4a448088e4a2527c77","modified":1490040584000},{"_id":"themes/landscape/Gruntfile.js","hash":"71adaeaac1f3cc56e36c49d549b8d8a72235c9b9","modified":1490040584000},{"_id":"themes/landscape/LICENSE","hash":"c480fce396b23997ee23cc535518ffaaf7f458f8","modified":1490040584000},{"_id":"themes/landscape/README.md","hash":"c7e83cfe8f2c724fc9cac32bd71bb5faf9ceeddb","modified":1490040584000},{"_id":"themes/landscape/_config.yml","hash":"fb8c98a0f6ff9f962637f329c22699721854cd73","modified":1490040584000},{"_id":"themes/landscape/package.json","hash":"85358dc34311c6662e841584e206a4679183943f","modified":1490040584000},{"_id":"source/_posts/hello-wor 
ld.md","hash":"0637e7741a9bb9db8b8c77a06dc6d55753762546","modified":1490042358000},{"_id":"themes/landscape/languages/default.yml","hash":"3083f319b352d21d80fc5e20113ddf27889c9d11","modified":1490040584000},{"_id":"themes/landscape/languages/fr.yml","hash":"84ab164b37c6abf625473e9a0c18f6f815dd5fd9","modified":1490040584000},{"_id":"themes/landscape/languages/nl.yml","hash":"12ed59faba1fc4e8cdd1d42ab55ef518dde8039c","modified":1490040584000},{"_id":"themes/landscape/languages/no.yml","hash":"965a171e70347215ec726952e63f5b47930931ef","modified":1490040584000},{"_id":"themes/landscape/languages/ru.yml","hash":"4fda301bbd8b39f2c714e2c934eccc4b27c0a2b0","modified":1490040584000},{"_id":"themes/landscape/languages/zh-CN.yml","hash":"ca40697097ab0b3672a80b455d3f4081292d1eed","modified":1490040584000},{"_id":"themes/landscape/languages/zh-TW.yml","hash":"53ce3000c5f767759c7d2c4efcaa9049788599c3","modified":1490040584000},{"_id":"themes/landscape/layout/archive.ejs","hash":"2703b07cc8ac64ae4 6d1d263f4653013c7e1666b","modified":1490040584000},{"_id":"themes/landscape/layout/category.ejs","hash":"765426a9c8236828dc34759e604cc2c52292835a","modified":1490040584000},{"_id":"themes/landscape/layout/index.ejs","hash":"aa1b4456907bdb43e629be3931547e2d29ac58c8","modified":1490040584000},{"_id":"themes/landscape/layout/layout.ejs","hash":"f155824ca6130080bb057fa3e868a743c69c4cf5","modified":1490040584000},{"_id":"themes/landscape/layout/page.ejs","hash":"7d80e4e36b14d30a7cd2ac1f61376d9ebf264e8b","modified":1490040584000},{"_id":"themes/landscape/layout/post.ejs","hash":"7d80e4e36b14d30a7cd2ac1f61376d9ebf264e8b","modified":1490040584000},{"_id":"themes/landscape/layout/tag.ejs","hash":"eaa7b4ccb2ca7befb90142e4e68995fb1ea68b2e","modified":1490040584000},{"_id":"themes/landscape/scripts/fancybox.js","hash":"aa411cd072399df1ddc8e2181a3204678a5177d9","modified":1490040584000},{"_id":"themes/landscape/layout/_partial/after-footer.ejs","hash":"82a30f81c0e8ba4a8af17acd6cc99e93834e4d5e"," 
modified":1490040584000},{"_id":"themes/landscape/layout/_partial/archive.ejs","hash":"931aaaffa0910a48199388ede576184ff15793ee","modified":1490040584000},{"_id":"themes/landscape/layout/_partial/archive-post.ejs","hash":"c7a71425a946d05414c069ec91811b5c09a92c47","modified":1490040584000},{"_id":"themes/landscape/layout/_partial/article.ejs","hash":"c4c835615d96a950d51fa2c3b5d64d0596534fed","modified":1490040584000},{"_id":"themes/landscape/layout/_partial/footer.ejs","hash":"93518893cf91287e797ebac543c560e2a63b8d0e","modified":1490040584000},{"_id":"themes/landscape/layout/_partial/google-analytics.ejs","hash":"f921e7f9223d7c95165e0f835f353b2938e40c45","modified":1490040584000},{"_id":"themes/landscape/layout/_partial/head.ejs","hash":"4fe8853e864d192701c03e5cd3a5390287b90612","modified":1490040584000},{"_id":"themes/landscape/layout/_partial/header.ejs","hash":"c21ca56f419d01a9f49c27b6be9f4a98402b2aa3","modified":1490040584000},{"_id":"themes/landscape/layout/_partial/mobile-nav.e js","hash":"e952a532dfc583930a666b9d4479c32d4a84b44e","modified":1490040584000},{"_id":"themes/landscape/layout/_partial/sidebar.ejs","hash":"930da35cc2d447a92e5ee8f835735e6fd2232469","modified":1490040584000},{"_id":"themes/landscape/layout/_widget/archive.ejs","hash":"beb4a86fcc82a9bdda9289b59db5a1988918bec3","modified":1490040584000},{"_id":"themes/landscape/layout/_widget/category.ejs","hash":"dd1e5af3c6af3f5d6c85dfd5ca1766faed6a0b05","modified":1490040584000},{"_id":"themes/landscape/layout/_widget/recent_posts.ejs","hash":"0d4f064733f8b9e45c0ce131fe4a689d570c883a","modified":1490040584000},{"_id":"themes/landscape/layout/_widget/tagcloud.ejs","hash":"b4a2079101643f63993dcdb32925c9b071763b46","modified":1490040584000},{"_id":"themes/landscape/layout/_widget/tag.ejs","hash":"2de380865df9ab5f577f7d3bcadf44261eb5faae","modified":1490040584000},{"_id":"themes/landscape/source/css/_extend.styl","hash":"222fbe6d222531d61c1ef0f868c90f747b1c2ced","modified":1490040584000},{"_id":"theme 
s/landscape/source/css/_variables.styl","hash":"5e37a6571caf87149af83ac1cc0cdef99f117350","modified":1490040584000},{"_id":"themes/landscape/source/css/style.styl","hash":"a70d9c44dac348d742702f6ba87e5bb3084d65db","modified":1490040584000},{"_id":"themes/landscape/source/fancybox/fancybox_loading.gif","hash":"1a755fb2599f3a313cc6cfdb14df043f8c14a99c","modified":1490040584000},{"_id":"themes/landscape/source/fancybox/blank.gif","hash":"2daeaa8b5f19f0bc209d976c02bd6acb51b00b0a","modified":1490040584000},{"_id":"themes/landscape/source/fancybox/[email protected]","hash":"273b123496a42ba45c3416adb027cd99745058b0","modified":1490040584000},{"_id":"themes/landscape/source/fancybox/fancybox_overlay.png","hash":"b3a4ee645ba494f52840ef8412015ba0f465dbe0","modified":1490040584000},{"_id":"themes/landscape/source/fancybox/fancybox_sprite.png","hash":"17df19f97628e77be09c352bf27425faea248251","modified":1490040584000},{"_id":"themes/landscape/source/fancybox/[email protected]","hash" :"30c58913f327e28f466a00f4c1ac8001b560aed8","modified":1490040584000},{"_id":"themes/landscape/source/fancybox/jquery.fancybox.css","hash":"aaa582fb9eb4b7092dc69fcb2d5b1c20cca58ab6","modified":1490040584000},{"_id":"themes/landscape/source/js/script.js","hash":"2876e0b19ce557fca38d7c6f49ca55922ab666a1","modified":1490040584000},{"_id":"themes/landscape/source/fancybox/jquery.fancybox.pack.js","hash":"9e0d51ca1dbe66f6c0c7aefd552dc8122e694a6e","modified":1490040584000},{"_id":"themes/landscape/source/fancybox/jquery.fancybox.js","hash":"d08b03a42d5c4ba456ef8ba33116fdbb7a9cabed","modified":1490040584000},{"_id":"themes/landscape/layout/_partial/post/category.ejs","hash":"c6bcd0e04271ffca81da25bcff5adf3d46f02fc0","modified":1490040584000},{"_id":"themes/landscape/layout/_partial/post/date.ejs","hash":"6197802873157656e3077c5099a7dda3d3b01c29","modified":1490040584000},{"_id":"themes/landscape/layout/_partial/post/gallery.ejs","hash":"3d9d81a3c693ff2378ef06ddb6810254e509de5b","modified": 
1490040584000},{"_id":"themes/landscape/layout/_partial/post/nav.ejs","hash":"16a904de7bceccbb36b4267565f2215704db2880","modified":1490040584000},{"_id":"themes/landscape/layout/_partial/post/tag.ejs","hash":"2fcb0bf9c8847a644167a27824c9bb19ac74dd14","modified":1490040584000},{"_id":"themes/landscape/layout/_partial/post/title.ejs","hash":"2f275739b6f1193c123646a5a31f37d48644c667","modified":1490040584000},{"_id":"themes/landscape/source/css/_partial/archive.styl","hash":"db15f5677dc68f1730e82190bab69c24611ca292","modified":1490040584000},{"_id":"themes/landscape/source/css/_partial/article.styl","hash":"10685f8787a79f79c9a26c2f943253450c498e3e","modified":1490040584000},{"_id":"themes/landscape/source/css/_partial/comment.styl","hash":"79d280d8d203abb3bd933ca9b8e38c78ec684987","modified":1490040584000},{"_id":"themes/landscape/source/css/_partial/footer.styl","hash":"e35a060b8512031048919709a8e7b1ec0e40bc1b","modified":1490040584000},{"_id":"themes/landscape/source/css/_partial/hea der.styl","hash":"85ab11e082f4dd86dde72bed653d57ec5381f30c","modified":1490040584000},{"_id":"themes/landscape/source/css/_partial/highlight.styl","hash":"bf4e7be1968dad495b04e83c95eac14c4d0ad7c0","modified":1490040584000},{"_id":"themes/landscape/source/css/_partial/mobile.styl","hash":"a399cf9e1e1cec3e4269066e2948d7ae5854d745","modified":1490040584000},{"_id":"themes/landscape/source/css/_partial/sidebar-aside.styl","hash":"890349df5145abf46ce7712010c89237900b3713","modified":1490040584000},{"_id":"themes/landscape/source/css/_partial/sidebar-bottom.styl","hash":"8fd4f30d319542babfd31f087ddbac550f000a8a","modified":1490040584000},{"_id":"themes/landscape/source/css/_partial/sidebar.styl","hash":"404ec059dc674a48b9ab89cd83f258dec4dcb24d","modified":1490040584000},{"_id":"themes/landscape/source/css/_util/grid.styl","hash":"0bf55ee5d09f193e249083602ac5fcdb1e571aed","modified":1490040584000},{"_id":"themes/landscape/source/css/_util/mixin.styl","hash":"44f32767d9fd3c1c08a60d91f181ee5 
3c8f0dbb3","modified":1490040584000},{"_id":"themes/landscape/source/css/fonts/fontawesome-webfont.eot","hash":"7619748fe34c64fb157a57f6d4ef3678f63a8f5e","modified":1490040584000},{"_id":"themes/landscape/source/css/fonts/FontAwesome.otf","hash":"b5b4f9be85f91f10799e87a083da1d050f842734","modified":1490040584000},{"_id":"themes/landscape/source/css/fonts/fontawesome-webfont.woff","hash":"04c3bf56d87a0828935bd6b4aee859995f321693","modified":1490040584000},{"_id":"themes/landscape/source/fancybox/helpers/fancybox_buttons.png","hash":"e385b139516c6813dcd64b8fc431c364ceafe5f3","modified":1490040584000},{"_id":"themes/landscape/source/fancybox/helpers/jquery.fancybox-buttons.css","hash":"1a9d8e5c22b371fcc69d4dbbb823d9c39f04c0c8","modified":1490040584000},{"_id":"themes/landscape/source/fancybox/helpers/jquery.fancybox-buttons.js","hash":"dc3645529a4bf72983a39fa34c1eb9146e082019","modified":1490040584000},{"_id":"themes/landscape/source/fancybox/helpers/jquery.fancybox-media.js","hash":"2 94420f9ff20f4e3584d212b0c262a00a96ecdb3","modified":1490040584000},{"_id":"themes/landscape/source/fancybox/helpers/jquery.fancybox-thumbs.css","hash":"4ac329c16a5277592fc12a37cca3d72ca4ec292f","modified":1490040584000},{"_id":"themes/landscape/source/fancybox/helpers/jquery.fancybox-thumbs.js","hash":"47da1ae5401c24b5c17cc18e2730780f5c1a7a0c","modified":1490040584000},{"_id":"themes/landscape/source/css/fonts/fontawesome-webfont.ttf","hash":"7f09c97f333917034ad08fa7295e916c9f72fd3f","modified":1490040584000},{"_id":"themes/landscape/source/css/fonts/fontawesome-webfont.svg","hash":"46fcc0194d75a0ddac0a038aee41b23456784814","modified":1490040584000},{"_id":"themes/landscape/source/css/images/banner.jpg","hash":"f44aa591089fcb3ec79770a1e102fd3289a7c6a6","modified":1490040584000},{"_id":"public/2017/03/20/hello-world/index.html","hash":"763f2d23f38c98ef40d246acee4c5fb1e4b90f03","modified":1490042215015},{"_id":"public/archives/index.html","hash":"144262bd43b07b6c968651e9dfa0f9e281fe6d 
1a","modified":1490042067730},{"_id":"public/archives/2017/index.html","hash":"c3c977357060f220a6e698d02fc2ccac45384d14","modified":1490042067732},{"_id":"public/index.html","hash":"45e436e11b9a07916c9bd37e42e2eb75b155df14","modified":1490042067732},{"_id":"public/archives/2017/03/index.html","hash":"ccd23b71511266d975544d19fda445d15979d458","modified":1490042067732},{"_id":"source/images/Business_Process.png","hash":"07776b4ec09c3ca286f1d0d1537cd89d3c053dff","modified":1489114942000},{"_id":"public/images/Business_Process.png","hash":"07776b4ec09c3ca286f1d0d1537cd89d3c053dff","modified":1490042215016}],"Category":[],"Data":[],"Page":[],"Post":[{"title":"Apache Griffin","_content":"\n## Abstract\nApache Griffin is a Data Quality Service platform built on Apache Hadoop and Apache Spark. It provides a framework process for defining data quality model, executing data quality measurement, automating data profiling and validation, as well as a unified data quality visualization across mu ltiple data systems. It tries to address the data quality challenges in big data and streaming context.\n\n\n## Overview of Apache Griffin \nAt eBay, when people use big data (Hadoop or other streaming systems), measurement of data quality is a big challenge. Different teams have built customized tools to detect and analyze data quality issues within their own domains. As a platform organization, we think of taking a platform approach to commonly occurring patterns. As such, we are building a platform to provide shared Infrastructure and generic features to solve common data quality pain points. This would enable us to build trusted data assets.\n\nCurrently it is very difficult and costly to do data quality validation when we have large volumes of related data flowing across multi-platforms (streaming and batch). Take eBay's Real-time Personalization Platform as a sample; Everyday we have to validate the data quality for ~600M records. 
Data quality often becomes one big challenge in this complex environment and massive scale.\n\nWe detect the following at eBay:\n\n1. Lack of an end-to-end, unified view of data quality from multiple data sources to target applications that takes into account the lineage of the data. This results in a long time to identify and fix data quality issues.\n2. Lack of a system to measure data quality in streaming mode through self-service. The need is for a system where datasets can be registered, data quality models can be defined, data quality can be visualized and monitored using a simple tool and teams alerted when an issue is detected.\n3. Lack of a Shared platform and API Service. Every team should not have to apply and manage own hardware and software infrastructure to solve this common problem.\n\nWith these in mind, we decided to build Apache Griffin - A data quality service that aims to solve the above short-comings.\n\nApache Griffin includes:\n\n**Data Quality Model Engine**: Apache Griffin is model driven solution, user can choose various data quality dimension to execute his/her data quality validation based on selected target data-set or source data-set ( as the golden reference data). 
It has corresponding library supporting it in back-end for the following measurement:\n\n - Accuracy - Does data reflect the real-world objects or a verifiable source\n - Completeness - Is all necessary data present\n - Validity - Are all data values within the data domains specified by the business\n - Timeliness - Is the data available at the time needed\n - Anomaly detection - Pre-built algorithm functions for the identification of items, events or observations which do not conform to an expected pattern or other items in a dataset\n - Data Profiling - Apply statistical analysis and assessment of data values within a dataset for consistency, uniqueness and logic.\n\n**Data Collection Layer**:\n\nWe support two kinds of data sources, batch data and real time data.\n\nFor batch mode, we can collect data source from our Hadoop platform by various data connectors.\n\nFor real time mode, we can connect with messaging system like Kafka to near real time analysis.\n\n**Data Process and Storage Layer**:\n\nFor batch analysis, our data quality model will compute data quality metrics in our spark cluster based on data source in hadoop.\n\nFor near real time analysis, we consume data from messaging system, then our data quality model will compute our real time data quality metrics in our spark cluster. for data storage, we use time series database in our back end to fulfill front end request.\n\n**Apache Griffin Service**:\n\nWe have RESTful web services to accomplish all the functionalities of Apache Griffin, such as register data-set, create data quality model, publish metrics, retrieve metrics, add subscription, etc. 
So, the developers can develop their own user interface based on these web serivces.\n\n## Main business process\nHere's the business process diagram\n\n\n\n## Rationale\nThe challenge we face at eBay is that our data volume is becoming bigger and bigger, systems process become more complex, while we do not have a unified data quality solution to ensure the trusted data sets which provide confidences on data quality to our data consumers. The key challenges on data quality includes:\n\n1. Existing commercial data quality solution cannot address data quality lineage among systems, cannot scale out to support fast growing data at eBay\n2. Existing eBay's domain specific tools take a long time to identify and fix poor data quality when data flowed through multiple systems\n3. Business logic becomes complex, requires data quality system much flexible.\n4. Some data quality issues do have business impact on user experiences, revenue, efficiency & compliance.\n5. Communication overhead of data quality metrics, typically in a big organization, which involve different teams.\n\nThe idea of Apache Apache Griffin is to provide Data Quality validation as a Service, to allow data engineers and data consumers to have:\n\n - Near real-time understanding of the data quality health of your data pipelines with end-to-end monitoring, all in one place.\n - Profiling, detecting and correlating issues and providing recommendations that drive rapid and focused troubleshooting\n - A centralized data quality model management system including rule, metadata, scheduler etc. \n - Native code generation to run everywhere, including Hadoop, Kafka, Spark, etc.\n - One set of tools to build data quality pipelines across all eBay data platforms.\n","source":"_posts/hello-world.md","raw":"---\ntitle: Apache Griffin\n---\n\n## Abstract\nApache Griffin is a Data Quality Service platform built on Apache Hadoop and Apache Spark. 
It provides a framework process for defining data quality model, executing data quality measurement, automating data profiling and validation, as well as a unified data quality visualization across multiple da ta systems. It tries to address the data quality challenges in big data and streaming context.\n\n\n## Overview of Apache Griffin \nAt eBay, when people use big data (Hadoop or other streaming systems), measurement of data quality is a big challenge. Different teams have built customized tools to detect and analyze data quality issues within their own domains. As a platform organization, we think of taking a platform approach to commonly occurring patterns. As such, we are building a platform to provide shared Infrastructure and generic features to solve common data quality pain points. This would enable us to build trusted data assets.\n\nCurrently it is very difficult and costly to do data quality validation when we have large volumes of related data flowing across multi-platforms (streaming and batch). Take eBay's Real-time Personalization Platform as a sample; Everyday we have to validate the data quality for ~600M records. Data quality often becomes one big challenge in this complex environment and massive scale.\n\nWe detect the following at eBay:\n\n1. Lack of an end-to-end, unified view of data quality from multiple data sources to target applications that takes into account the lineage of the data. This results in a long time to identify and fix data quality issues.\n2. Lack of a system to measure data quality in streaming mode through self-service. The need is for a system where datasets can be registered, data quality models can be defined, data quality can be visualized and monitored using a simple tool and teams alerted when an issue is detected.\n3. Lack of a Shared platform and API Service. 
Every team should not have to apply and manage own hardware and software infrastructure to solve this common problem.\n\nWith these in mind, we decided to build Apache Griffin - A data quality service that aims to solve the above short-comings.\n\nApache Griffin includes:\n\n**Data Quality Model Engine**: Apache Griffin is model driven solution, user can ch oose various data quality dimension to execute his/her data quality validation based on selected target data-set or source data-set ( as the golden reference data). It has corresponding library supporting it in back-end for the following measurement:\n\n - Accuracy - Does data reflect the real-world objects or a verifiable source\n - Completeness - Is all necessary data present\n - Validity - Are all data values within the data domains specified by the business\n - Timeliness - Is the data available at the time needed\n - Anomaly detection - Pre-built algorithm functions for the identification of items, events or observations which do not conform to an expected pattern or other items in a dataset\n - Data Profiling - Apply statistical analysis and assessment of data values within a dataset for consistency, uniqueness and logic.\n\n**Data Collection Layer**:\n\nWe support two kinds of data sources, batch data and real time data.\n\nFor batch mode, we can collect data source from o ur Hadoop platform by various data connectors.\n\nFor real time mode, we can connect with messaging system like Kafka to near real time analysis.\n\n**Data Process and Storage Layer**:\n\nFor batch analysis, our data quality model will compute data quality metrics in our spark cluster based on data source in hadoop.\n\nFor near real time analysis, we consume data from messaging system, then our data quality model will compute our real time data quality metrics in our spark cluster. 
for data storage, we use time series database in our back end to fulfill front end request.\n\n**Apache Griffin Service**:\n\nWe have RESTful web services to accomplish all the functionalities of Apache Griffin, such as register data-set, create data quality model, publish metrics, retrieve metrics, add subscription, etc. So, the developers can develop their own user interface based on these web serivces.\n\n## Main business process\nHere's the business process diagram\n\n \n\n## Rationale\nThe challenge we face at eBay is that our data volume is becoming bigger and bigger, systems process become more complex, while we do not have a unified data quality solution to ensure the trusted data sets which provide confidences on data quality to our data consumers. The key challenges on data quality includes:\n\n1. Existing commercial data quality solution cannot address data quality lineage among systems, cannot scale out to support fast growing data at eBay\n2. Existing eBay's domain specific tools take a long time to identify and fix poor data quality when data flowed through multiple systems\n3. Business logic becomes complex, requires data quality system much flexible.\n4. Some data quality issues do have business impact on user experiences, revenue, efficiency & compliance.\n5. Communication overhead of data quality metrics, typically in a big organization, which involve different teams.\n\nThe idea of Apache Apache Griffin is to provide Data Quality validation as a Service, to allow data engineers and data consumers to have:\n\n - Near real-time understanding of the data quality health of your data pipelines with end-to-end monitoring, all in one place.\n - Profiling, detecting and correlating issues and providing recommendations that drive rapid and focused troubleshooting\n - A centralized data quality model management system including rule, metadata, scheduler etc. 
\n - Native code generation to run everywhere, including Hadoop, Kafka, Spark, etc.\n - One set of tools to build data quality pipelines across all eBay data platforms.\n","slug":"hello-world","published":1,"date":"2017-03-20T20:09:44.000Z","updated":"2017-03-20T20:39:18.000Z","_id":"cj0ikfzhw0000pgpouj7009vf","comments":1,"layout":"post","photos":[],"link":"","content":"<h2 id=\"Abstract\"><a href=\"#Abstract\" class=\"headerlink\" title=\"Abstract\"></a>Abstract</h2><p>Apache Griffin is a Data Quality Service platform built on Apache Hadoop and Apache Spark. It provides a framework process for defining data quality model, executing data quality measurement, automating data profiling and validation, as well as a unified data quality visualization across multiple data systems. It tries to address the data quality challenges in big data and streaming context.</p>\n<h2 id=\"Overview-of-Apache-Griffin\"><a href=\"#Overview-of-Apache-Griffin\" class=\"headerlink\" title=\"Overview of Apache Griffin\"></a>Overview of Apache Griffin</h2><p>At eBay, when people use big data (Hadoop or other streaming systems), measurement of data quality is a big challenge. Different teams have built customized tools to detect and analyze data quality issues within their own domains. As a platform organization, we think of taking a platform approach to commonly occurring patterns. As such, we are building a platform to provide shared Infrastructure and generic features to solve common data quality pain points. This would enable us to build trusted data assets.</p>\n<p>Currently it is very difficult and costly to do data quality validation when we have large volumes of related data flowing across multi-platforms (streaming and batch). Take eBay’s Real-time Personalization Platform as a sample; Everyday we have to validate the data quality for ~600M records. 
Data quality often becomes one big challenge in this complex environment and massive scale.</p>\n<p>We detect the following at eBay:</p>\n<ol>\n<li>Lack of an end-to-end, unified view of data quality from multiple data sources to target applications that takes into account the lineage of the data. This results in a long time to identify and fix data quality issues.</li>\n<li>Lack of a system to measure data quality in streaming mode through self-service. The need is for a system where datasets can be registered, data quality models can be defined, data quality can be visualized and monitored using a simple tool and teams alerted when an issue is detected.</li>\n<li>Lack of a Shared platform and API Service. Every team should not have to apply and manage own hardware and software infrastructure to solve this common problem.</li>\n</ol>\n<p>With these in mind, we decided to build Apache Griffin - A data quality service that aims to solve the above short-comings.</p>\n<p>Apache Griffin includes:</p>\n<p><strong>Data Quality Model Engine</strong>: Apache Griffin is model driven solution, user can choose various data quality dimension to execute his/her data quality validation based on selected target data-set or source data-set ( as the golden reference data). 
It has corresponding library supporting it in back-end for the following measurement:</p>\n<ul>\n<li>Accuracy - Does data reflect the real-world objects or a verifiable source</li>\n<li>Completeness - Is all necessary data present</li>\n<li>Validity - Are all data values within the data domains specified by the business</li>\n<li>Timeliness - Is the data available at the time needed</li>\n<li>Anomaly detection - Pre-built algorithm functions for the identification of items, events or observations which do not conform to an expected pattern or other items in a dataset</li>\n<li>Data Profiling - Apply statistical analysis and assessment of data values within a dataset for consistency, uniqueness and logic.</li>\n</ul>\n<p><strong>Data Collection Layer</strong>:</p>\n<p>We support two kinds of data sources, batch data and real time data.</p>\n<p>For batch mode, we can collect data source from our Hadoop platform by various data connectors.</p>\n<p>For real time mode, we can connect with messaging system like Kafka to near real time analysis.</p>\n<p><strong>Data Process and Storage Layer</strong>:</p>\n<p>For batch analysis, our data quality model will compute data quality metrics in our spark cluster based on data source in hadoop.</p>\n<p>For near real time analysis, we consume data from messaging system, then our data quality model will compute our real time data quality metrics in our spark cluster. for data storage, we use time series database in our back end to fulfill front end request.</p>\n<p><strong>Apache Griffin Service</strong>:</p>\n<p>We have RESTful web services to accomplish all the functionalities of Apache Griffin, such as register data-set, create data quality model, publish metrics, retrieve metrics, add subscription, etc. 
So, the developers can develop their own user interface based on these web serivces.</p>\n<h2 id=\"Main-business-process\"><a href=\"#Main-business-process\" class=\"headerlink\" title=\"Main business process\"></a>Main business process</h2><p>Here’s the business process diagram</p>\n<p><img src=\"/images/Business_Process.png\" alt=\"\"></p>\n<h2 id=\"Rationale\"><a href=\"#Rationale\" class=\"headerlink\" title=\"Rationale\"></a>Rationale</h2><p>The challenge we face at eBay is that our data volume is becoming bigger and bigger, systems process become more complex, while we do not have a unified data quality solution to ensure the trusted data sets which provide confidences on data quality to our data consumers. The key challenges on data quality includes:</p>\n<ol>\n<li>Existing commercial data quality solution cannot address data quality lineage among systems, cannot scale out to support fast growing data at eBay</li>\n<li>Existing eBay’s domain specific tools take a long time to identify and fix poor data quality when data flowed through multiple systems</li>\n<li>Business logic becomes complex, requires data quality system much flexible.</li>\n<li>Some data quality issues do have business impact on user experiences, revenue, efficiency & compliance.</li>\n<li>Communication overhead of data quality metrics, typically in a big organization, which involve different teams.</li>\n</ol>\n<p>The idea of Apache Apache Griffin is to provide Data Quality validation as a Service, to allow data engineers and data consumers to have:</p>\n<ul>\n<li>Near real-time understanding of the data quality health of your data pipelines with end-to-end monitoring, all in one place.</li>\n<li>Profiling, detecting and correlating issues and providing recommendations that drive rapid and focused troubleshooting</li>\n<li>A centralized data quality model management system including rule, metadata, scheduler etc. 
</li>\n<li>Native code generation to run everywhere, including Hadoop, Kafka, Spark, etc.</li>\n<li>One set of tools to build data quality pipelines across all eBay data platforms.</li>\n</ul>\n","excerpt":"","more":"<h2 id=\"Abstract\"><a href=\"#Abstract\" class=\"headerlink\" title=\"Abstract\"></a>Abstract</h2><p>Apache Griffin is a Data Quality Service platform built on Apache Hadoop and Apache Spark. It provides a framework process for defining data quality model, executing data quality measurement, automating data profiling and validation, as well as a unified data quality visualization across multiple data systems. It tries to address the data quality challenges in big data and streaming context.</p>\n<h2 id=\"Overview-of-Apache-Griffin\"><a href=\"#Overview-of-Apache-Griffin\" class=\"headerlink\" title=\"Overview of Apache Griffin\"></a>Overview of Apache Griffin</h2><p>At eBay, when people use big data (Hadoop or other streaming systems), measurement of data quality is a big challenge. Different teams have built customized tools to detect and analyze data quality issues within their own domains. As a platform organization, we think of taking a platform approach to commonly occurring patterns. As such, we are building a platform to provide shared Infrastructure and generic features to solve common data quality pain points. This would enable us to build trusted data assets.</p>\n<p>Currently it is very difficult and costly to do data quality validation when we have large volumes of related data flowing across multi-platforms (streaming and batch). Take eBay’s Real-time Personalization Platform as a sample; Everyday we have to validate the data quality for ~600M records. 
Data quality often becomes one big challenge in this complex environment and massive scale.</p>\n<p>We detect the following at eBay:</p>\n<ol>\n<li>Lack of an end-to-end, unified view of data quality from multiple data sources to target applications that takes into account the lineage of the data. This results in a long time to identify and fix data quality issues.</li>\n<li>Lack of a system to measure data quality in streaming mode through self-service. The need is for a system where datasets can be registered, data quality models can be defined, data quality can be visualized and monitored using a simple tool and teams alerted when an issue is detected.</li>\n<li>Lack of a Shared platform and API Service. Every team should not have to apply and manage own hardware and software infrastructure to solve this common problem.</li>\n</ol>\n<p>With these in mind, we decided to build Apache Griffin - A data quality service that aims to solve the above short-comings.</p>\n<p>Apache Griffin includes:</p>\n<p><strong>Data Quality Model Engine</strong>: Apache Griffin is model driven solution, user can choose various data quality dimension to execute his/her data quality validation based on selected target data-set or source data-set ( as the golden reference data). 
It has corresponding library supporting it in back-end for the following measurement:</p>\n<ul>\n<li>Accuracy - Does data reflect the real-world objects or a verifiable source</li>\n<li>Completeness - Is all necessary data present</li>\n<li>Validity - Are all data values within the data domains specified by the business</li>\n<li>Timeliness - Is the data available at the time needed</li>\n<li>Anomaly detection - Pre-built algorithm functions for the identification of items, events or observations which do not conform to an expected pattern or other items in a dataset</li>\n<li>Data Profiling - Apply statistical analysis and assessment of data values within a dataset for consistency, uniqueness and logic.</li>\n</ul>\n<p><strong>Data Collection Layer</strong>:</p>\n<p>We support two kinds of data sources, batch data and real time data.</p>\n<p>For batch mode, we can collect data source from our Hadoop platform by various data connectors.</p>\n<p>For real time mode, we can connect with messaging system like Kafka to near real time analysis.</p>\n<p><strong>Data Process and Storage Layer</strong>:</p>\n<p>For batch analysis, our data quality model will compute data quality metrics in our spark cluster based on data source in hadoop.</p>\n<p>For near real time analysis, we consume data from messaging system, then our data quality model will compute our real time data quality metrics in our spark cluster. for data storage, we use time series database in our back end to fulfill front end request.</p>\n<p><strong>Apache Griffin Service</strong>:</p>\n<p>We have RESTful web services to accomplish all the functionalities of Apache Griffin, such as register data-set, create data quality model, publish metrics, retrieve metrics, add subscription, etc. 
So, the developers can develop their own user interface based on these web serivces.</p>\n<h2 id=\"Main-business-process\"><a href=\"#Main-business-process\" class=\"headerlink\" title=\"Main business process\"></a>Main business process</h2><p>Here’s the business process diagram</p>\n<p><img src=\"/images/Business_Process.png\" alt=\"\"></p>\n<h2 id=\"Rationale\"><a href=\"#Rationale\" class=\"headerlink\" title=\"Rationale\"></a>Rationale</h2><p>The challenge we face at eBay is that our data volume is becoming bigger and bigger, systems process become more complex, while we do not have a unified data quality solution to ensure the trusted data sets which provide confidences on data quality to our data consumers. The key challenges on data quality includes:</p>\n<ol>\n<li>Existing commercial data quality solution cannot address data quality lineage among systems, cannot scale out to support fast growing data at eBay</li>\n<li>Existing eBay’s domain specific tools take a long time to identify and fix poor data quality when data flowed through multiple systems</li>\n<li>Business logic becomes complex, requires data quality system much flexible.</li>\n<li>Some data quality issues do have business impact on user experiences, revenue, efficiency & compliance.</li>\n<li>Communication overhead of data quality metrics, typically in a big organization, which involve different teams.</li>\n</ol>\n<p>The idea of Apache Apache Griffin is to provide Data Quality validation as a Service, to allow data engineers and data consumers to have:</p>\n<ul>\n<li>Near real-time understanding of the data quality health of your data pipelines with end-to-end monitoring, all in one place.</li>\n<li>Profiling, detecting and correlating issues and providing recommendations that drive rapid and focused troubleshooting</li>\n<li>A centralized data quality model management system including rule, metadata, scheduler etc. 
</li>\n<li>Native code generation to run everywhere, including Hadoop, Kafka, Spark, etc.</li>\n<li>One set of tools to build data quality pipelines across all eBay data platforms.</li>\n</ul>\n"}],"PostAsset":[],"PostCategory":[],"PostTag":[],"Tag":[]}} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-griffin-site/blob/c8612f14/source/_posts/hello-world.md ---------------------------------------------------------------------- diff --git a/source/_posts/hello-world.md b/source/_posts/hello-world.md index c090297..af123e6 100644 --- a/source/_posts/hello-world.md +++ b/source/_posts/hello-world.md @@ -1,38 +1,71 @@ --- -title: Hello World +title: Apache Griffin --- -Welcome to [Hexo](https://hexo.io/)! This is your very first post. Check [documentation](https://hexo.io/docs/) for more info. If you get any problems when using Hexo, you can find the answer in [troubleshooting](https://hexo.io/docs/troubleshooting.html) or you can ask me on [GitHub](https://github.com/hexojs/hexo/issues). -## Quick Start +## Abstract +Apache Griffin is a Data Quality Service platform built on Apache Hadoop and Apache Spark. It provides a framework process for defining data quality model, executing data quality measurement, automating data profiling and validation, as well as a unified data quality visualization across multiple data systems. It tries to address the data quality challenges in big data and streaming context. -### Create a new post -``` bash -$ hexo new "My New Post" -``` +## Overview of Apache Griffin +At eBay, when people use big data (Hadoop or other streaming systems), measurement of data quality is a big challenge. Different teams have built customized tools to detect and analyze data quality issues within their own domains. As a platform organization, we think of taking a platform approach to commonly occurring patterns. 
As such, we are building a platform to provide shared Infrastructure and generic features to solve common data quality pain points. This would enable us to build trusted data assets. -More info: [Writing](https://hexo.io/docs/writing.html) +Currently it is very difficult and costly to do data quality validation when we have large volumes of related data flowing across multi-platforms (streaming and batch). Take eBay's Real-time Personalization Platform as a sample; Everyday we have to validate the data quality for ~600M records. Data quality often becomes one big challenge in this complex environment and massive scale. -### Run server +We detect the following at eBay: -``` bash -$ hexo server -``` +1. Lack of an end-to-end, unified view of data quality from multiple data sources to target applications that takes into account the lineage of the data. This results in a long time to identify and fix data quality issues. +2. Lack of a system to measure data quality in streaming mode through self-service. The need is for a system where datasets can be registered, data quality models can be defined, data quality can be visualized and monitored using a simple tool and teams alerted when an issue is detected. +3. Lack of a Shared platform and API Service. Every team should not have to apply and manage own hardware and software infrastructure to solve this common problem. -More info: [Server](https://hexo.io/docs/server.html) +With these in mind, we decided to build Apache Griffin - A data quality service that aims to solve the above short-comings. -### Generate static files +Apache Griffin includes: -``` bash -$ hexo generate -``` +**Data Quality Model Engine**: Apache Griffin is model driven solution, user can choose various data quality dimension to execute his/her data quality validation based on selected target data-set or source data-set ( as the golden reference data). 
It has corresponding library supporting it in back-end for the following measurement: -More info: [Generating](https://hexo.io/docs/generating.html) + - Accuracy - Does data reflect the real-world objects or a verifiable source + - Completeness - Is all necessary data present + - Validity - Are all data values within the data domains specified by the business + - Timeliness - Is the data available at the time needed + - Anomaly detection - Pre-built algorithm functions for the identification of items, events or observations which do not conform to an expected pattern or other items in a dataset + - Data Profiling - Apply statistical analysis and assessment of data values within a dataset for consistency, uniqueness and logic. -### Deploy to remote sites +**Data Collection Layer**: -``` bash -$ hexo deploy -``` +We support two kinds of data sources, batch data and real time data. -More info: [Deployment](https://hexo.io/docs/deployment.html) +For batch mode, we can collect data source from our Hadoop platform by various data connectors. + +For real time mode, we can connect with messaging system like Kafka to near real time analysis. + +**Data Process and Storage Layer**: + +For batch analysis, our data quality model will compute data quality metrics in our spark cluster based on data source in hadoop. + +For near real time analysis, we consume data from messaging system, then our data quality model will compute our real time data quality metrics in our spark cluster. for data storage, we use time series database in our back end to fulfill front end request. + +**Apache Griffin Service**: + +We have RESTful web services to accomplish all the functionalities of Apache Griffin, such as register data-set, create data quality model, publish metrics, retrieve metrics, add subscription, etc. So, the developers can develop their own user interface based on these web serivces. 
+ +## Main business process +Here's the business process diagram + + + +## Rationale +The challenge we face at eBay is that our data volume is becoming bigger and bigger, systems process become more complex, while we do not have a unified data quality solution to ensure the trusted data sets which provide confidences on data quality to our data consumers. The key challenges on data quality includes: + +1. Existing commercial data quality solution cannot address data quality lineage among systems, cannot scale out to support fast growing data at eBay +2. Existing eBay's domain specific tools take a long time to identify and fix poor data quality when data flowed through multiple systems +3. Business logic becomes complex, requires data quality system much flexible. +4. Some data quality issues do have business impact on user experiences, revenue, efficiency & compliance. +5. Communication overhead of data quality metrics, typically in a big organization, which involve different teams. + +The idea of Apache Apache Griffin is to provide Data Quality validation as a Service, to allow data engineers and data consumers to have: + + - Near real-time understanding of the data quality health of your data pipelines with end-to-end monitoring, all in one place. + - Profiling, detecting and correlating issues and providing recommendations that drive rapid and focused troubleshooting + - A centralized data quality model management system including rule, metadata, scheduler etc. + - Native code generation to run everywhere, including Hadoop, Kafka, Spark, etc. + - One set of tools to build data quality pipelines across all eBay data platforms. 
http://git-wip-us.apache.org/repos/asf/incubator-griffin-site/blob/c8612f14/source/_posts/hello.md ---------------------------------------------------------------------- diff --git a/source/_posts/hello.md b/source/_posts/hello.md deleted file mode 100644 index d65ea91..0000000 --- a/source/_posts/hello.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: hello -date: 2017-03-20 10:20:14 -tags: ---- http://git-wip-us.apache.org/repos/asf/incubator-griffin-site/blob/c8612f14/source/images/Business_Process.png ---------------------------------------------------------------------- diff --git a/source/images/Business_Process.png b/source/images/Business_Process.png new file mode 100644 index 0000000..ff0f25f Binary files /dev/null and b/source/images/Business_Process.png differ
