{"id":78190,"name":null,"description":"Data processing for and with foundation models!  🍎 🍋 🌽 ➡️ ➡️🍸 🍹 🍷","url":"https://github.com/modelscope/data-juicer","last_synced_at":"2025-09-08T21:17:27.648Z","repository":{"id":185316735,"uuid":"673277958","full_name":"modelscope/data-juicer","owner":"modelscope","description":"Data processing for and with foundation models!  🍎 🍋 🌽 ➡️ ➡️🍸 🍹 🍷","archived":false,"fork":false,"pushed_at":"2025-09-05T07:53:04.000Z","size":475048,"stargazers_count":5142,"open_issues_count":68,"forks_count":267,"subscribers_count":20,"default_branch":"main","last_synced_at":"2025-09-07T21:42:32.456Z","etag":null,"topics":["data","data-analysis","data-pipeline","data-processing","data-science","data-visualization","foundation-models","instruction-tuning","large-language-models","llm","llms","multi-modal","pre-training","synthetic-data"],"latest_commit_sha":null,"homepage":"https://modelscope.github.io/data-juicer/","language":"Python","has_issues":true,"has_wiki":null,"has_pages":null,"mirror_url":null,"source_name":null,"license":"apache-2.0","status":null,"scm":"git","pull_requests_enabled":true,"icon_url":"https://github.com/modelscope.png","metadata":{"files":{"readme":"README.md","changelog":null,"contributing":null,"funding":null,"license":"LICENSE","code_of_conduct":null,"threat_model":null,"audit":null,"citation":null,"codeowners":null,"security":null,"support":null,"governance":null,"roadmap":null,"authors":null,"dei":null,"publiccode":null,"codemeta":null,"zenodo":null,"notice":null,"maintainers":null,"copyright":null,"agents":null,"dco":null,"cla":null}},"created_at":"2023-08-01T09:16:41.000Z","updated_at":"2025-09-07T14:05:12.000Z","dependencies_parsed_at":null,"dependency_job_id":"b3dfe45b-7817-4c78-b9b4-06c1e0fd1372","html_url":"https://github.com/modelscope/data-juicer","commit_stats":{"total_commits":223,"total_committers":28,"mean_commits":7.964285714285714,"dds":0.7713004484304933,"last_synced_commit":"8e9b4c0b21d099fb950e74e6e176a5f730dd39eb"},"previous_names":["alibaba/data-juicer","modelscope/data-juicer"],"tags_count":20,"template":false,"template_full_name":null,"purl":"pkg:github/modelscope/data-juicer","repository_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/modelscope%2Fdata-juicer","tags_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/modelscope%2Fdata-juicer/tags","releases_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/modelscope%2Fdata-juicer/releases","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/modelscope%2Fdata-juicer/manifests","owner_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/modelscope","download_url":"https://codeload.github.com/modelscope/data-juicer/tar.gz/refs/heads/main","sbom_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/modelscope%2Fdata-juicer/sbom","scorecard":null,"host":{"name":"GitHub","url":"https://github.com","kind":"github","repositories_count":274231506,"owners_count":25245625,"icon_url":"https://github.com/github.png","version":null,"created_at":"2022-05-30T11:31:42.601Z","updated_at":"2022-07-04T15:15:14.044Z","status":"online","status_checked_at":"2025-09-08T02:00:09.813Z","response_time":121,"last_error":null,"robots_txt_status":"success","robots_txt_updated_at":"2025-07-24T06:49:26.215Z","robots_txt_url":"https://github.com/robots.txt","online":true,"can_crawl_api":true,"host_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub","repositories_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories","repository_names_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repository_names","owners_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners"}},"owner":{"login":"modelscope","name":"ModelScope","uuid":"109945100","kind":"organization","description":"Model-as-a-Service in the making: bring accessible AI to all.","email":"contact@modelscope.cn","website":"https://www.modelscope.cn/","location":null,"twitter":null,"company":null,"icon_url":"https://avatars.githubusercontent.com/u/109945100?v=4","repositories_count":3,"last_synced_at":"2023-04-03T10:08:13.111Z","metadata":{"has_sponsors_listing":false},"html_url":"https://github.com/modelscope","funding_links":[],"total_stars":null,"followers":null,"following":null,"created_at":"2023-04-03T10:08:13.157Z","updated_at":"2023-04-03T10:08:13.157Z","owner_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/modelscope","repositories_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/modelscope/repositories"},"packages":[{"id":8279458,"name":"py-data-juicer","ecosystem":"pypi","description":"Data Processing for and with Foundation Models.","homepage":null,"licenses":"Apache-2.0","normalized_licenses":["Apache-2.0"],"repository_url":"https://github.com/modelscope/data-juicer","keywords_array":[],"namespace":null,"versions_count":21,"first_release_published_at":"2023-09-15T02:19:28.000Z","latest_release_published_at":"2025-08-18T03:25:25.000Z","latest_release_number":"1.4.2","last_synced_at":"2025-09-07T21:36:29.180Z","created_at":"2023-09-15T02:32:14.860Z","updated_at":"2025-09-07T21:36:29.180Z","registry_url":"https://pypi.org/project/py-data-juicer/","install_command":"pip install py-data-juicer --index-url https://pypi.org/simple","documentation_url":"https://py-data-juicer.readthedocs.io/","metadata":{"funding":null,"documentation":null,"classifiers":["License :: OSI Approved :: Apache Software License","Operating System :: OS Independent","Programming Language :: Python :: 3"],"normalized_name":"py-data-juicer","project_status":null},"repo_metadata":{"id":185316735,"uuid":"673277958","full_name":"modelscope/data-juicer","owner":"modelscope","description":"A one-stop data processing system to make data higher-quality, juicier, and more digestible for LLMs!  🍎 🍋 🌽 ➡️ ➡️🍸 🍹 🍷为大语言模型提供更高质量、更丰富、更易”消化“的数据！","archived":false,"fork":false,"pushed_at":"2024-03-21T02:15:39.000Z","size":34272,"stargazers_count":1321,"open_issues_count":12,"forks_count":78,"subscribers_count":13,"default_branch":"main","last_synced_at":"2024-03-21T03:27:04.766Z","etag":null,"topics":["chinese","data-analysis","data-science","data-visualization","dataset","gpt","gpt-4","instruction-tuning","large-language-models","llama","llava","llm","llms","multi-modal","nlp","opendata","pre-training","pytorch","sora","streamlit"],"latest_commit_sha":null,"homepage":"","language":"Python","has_issues":true,"has_wiki":null,"has_pages":null,"mirror_url":null,"source_name":null,"license":"apache-2.0","status":null,"scm":"git","pull_requests_enabled":true,"icon_url":"https://github.com/modelscope.png","metadata":{"files":{"readme":"README.md","changelog":null,"contributing":null,"funding":null,"license":"LICENSE","code_of_conduct":null,"threat_model":null,"audit":null,"citation":null,"codeowners":null,"security":null,"support":null,"governance":null,"roadmap":null,"authors":null,"dei":null}},"created_at":"2023-08-01T09:16:41.000Z","updated_at":"2024-03-21T03:27:14.284Z","dependencies_parsed_at":null,"dependency_job_id":"b3dfe45b-7817-4c78-b9b4-06c1e0fd1372","html_url":"https://github.com/modelscope/data-juicer","commit_stats":{"total_commits":63,"total_committers":13,"mean_commits":4.846153846153846,"dds":0.6349206349206349,"last_synced_commit":"f2999866361fd181a21bf580c3e31a0689f74941"},"previous_names":["alibaba/data-juicer","modelscope/data-juicer"],"tags_count":4,"template":false,"template_full_name":null,"repository_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/modelscope%2Fdata-juicer","tags_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/modelscope%2Fdata-juicer/tags","releases_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/modelscope%2Fdata-juicer/releases","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/modelscope%2Fdata-juicer/manifests","owner_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/modelscope","download_url":"https://codeload.github.com/modelscope/data-juicer/tar.gz/refs/heads/main","host":{"name":"GitHub","url":"https://github.com","kind":"github","repositories_count":214662657,"owners_count":15766250,"icon_url":"https://github.com/github.png","version":null,"created_at":"2022-05-30T11:31:42.601Z","updated_at":"2022-07-04T15:15:14.044Z","host_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub","repositories_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories","repository_names_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repository_names","owners_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners"},"owner_record":{"login":"modelscope","name":"ModelScope","uuid":"109945100","kind":"organization","description":"Model-as-a-Service in the making: bring accessible AI to all.","email":"contact@modelscope.cn","website":"https://www.modelscope.cn/","location":null,"twitter":null,"company":null,"icon_url":"https://avatars.githubusercontent.com/u/109945100?v=4","repositories_count":3,"last_synced_at":"2023-04-03T10:08:13.111Z","metadata":{"has_sponsors_listing":false},"html_url":"https://github.com/modelscope","funding_links":[],"total_stars":null,"followers":null,"following":null,"created_at":"2023-04-03T10:08:13.157Z","updated_at":"2023-04-03T10:08:13.157Z","owner_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/modelscope","repositories_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/modelscope/repositories"},"tags":[{"name":"v0.2.0","sha":"156ed20acdac7c3fe02911bc9d98ecebd3ec5fb0","kind":"commit","published_at":"2024-03-07T12:23:25.000Z","download_url":"https://codeload.github.com/modelscope/data-juicer/tar.gz/v0.2.0","html_url":"https://github.com/modelscope/data-juicer/releases/tag/v0.2.0","dependencies_parsed_at":null,"dependency_job_id":null,"tag_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/modelscope%2Fdata-juicer/tags/v0.2.0","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/modelscope%2Fdata-juicer/tags/v0.2.0/manifests"},{"name":"v0.1.3","sha":"a3c8310bf0848e787fba5eca2373e69a25767fd7","kind":"commit","published_at":"2024-01-05T07:19:44.000Z","download_url":"https://codeload.github.com/modelscope/data-juicer/tar.gz/v0.1.3","html_url":"https://github.com/modelscope/data-juicer/releases/tag/v0.1.3","dependencies_parsed_at":null,"dependency_job_id":null,"tag_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/modelscope%2Fdata-juicer/tags/v0.1.3","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/modelscope%2Fdata-juicer/tags/v0.1.3/manifests"},{"name":"v0.1.2","sha":"5bd715d074d4b385a9eee9a1a670683e49362658","kind":"commit","published_at":"2023-09-28T04:05:19.000Z","download_url":"https://codeload.github.com/modelscope/data-juicer/tar.gz/v0.1.2","html_url":"https://github.com/modelscope/data-juicer/releases/tag/v0.1.2","dependencies_parsed_at":null,"dependency_job_id":null,"tag_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/modelscope%2Fdata-juicer/tags/v0.1.2","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/modelscope%2Fdata-juicer/tags/v0.1.2/manifests"},{"name":"v0.1.0","sha":"d4ab729b89d440d10e531e16302276504a074608","kind":"commit","published_at":"2023-08-10T11:19:05.000Z","download_url":"https://codeload.github.com/modelscope/data-juicer/tar.gz/v0.1.0","html_url":"https://github.com/modelscope/data-juicer/releases/tag/v0.1.0","dependencies_parsed_at":null,"dependency_job_id":null,"tag_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/modelscope%2Fdata-juicer/tags/v0.1.0","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/modelscope%2Fdata-juicer/tags/v0.1.0/manifests"}]},"repo_metadata_updated_at":"2024-08-09T12:23:35.259Z","dependent_packages_count":0,"downloads":1330,"downloads_period":"last-month","dependent_repos_count":0,"rankings":{"downloads":16.886596715869345,"dependent_repos_count":68.90986571625571,"dependent_packages_count":7.381897931361508,"stargazers_count":7.066541520525614,"forks_count":11.988384955752213,"docker_downloads_count":null,"average":22.446657367952877},"purl":"pkg:pypi/py-data-juicer","advisories":[],"docker_usage_url":"https://docker.ecosyste.ms/usage/pypi/py-data-juicer","docker_dependents_count":null,"docker_downloads_count":null,"usage_url":"https://repos.ecosyste.ms/usage/pypi/py-data-juicer","dependent_repositories_url":"https://repos.ecosyste.ms/api/v1/usage/pypi/py-data-juicer/dependencies","status":null,"funding_links":[],"critical":null,"issue_metadata":{"last_synced_at":"2024-07-25T15:36:29.864Z","issues_count":43,"pull_requests_count":57,"avg_time_to_close_issue":863284.023255814,"avg_time_to_close_pull_request":112377.33333333333,"issues_closed_count":43,"pull_requests_closed_count":57,"pull_request_authors_count":9,"issue_authors_count":19,"avg_comments_per_issue":2.744186046511628,"avg_comments_per_pull_request":0.2982456140350877,"merged_pull_requests_count":56,"bot_issues_count":0,"bot_pull_requests_count":0,"past_year_issues_count":43,"past_year_pull_requests_count":57,"past_year_avg_time_to_close_issue":863284.023255814,"past_year_avg_time_to_close_pull_request":112377.33333333333,"past_year_issues_closed_count":43,"past_year_pull_requests_closed_count":57,"past_year_pull_request_authors_count":9,"past_year_issue_authors_count":19,"past_year_avg_comments_per_issue":2.744186046511628,"past_year_avg_comments_per_pull_request":0.2982456140350877,"past_year_bot_issues_count":0,"past_year_bot_pull_requests_count":0,"past_year_merged_pull_requests_count":56,"issues_url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/repositories/modelscope%2Fdata-juicer/issues","maintainers":[{"login":"HYLcool","count":39,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/HYLcool"},{"login":"zhijianma","count":12,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/zhijianma"},{"login":"yxdyc","count":11,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/yxdyc"},{"login":"chenhesen","count":6,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/chenhesen"},{"login":"pan-x-c","count":2,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/pan-x-c"},{"login":"xieyxclack","count":2,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/xieyxclack"},{"login":"drcege","count":1,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/drcege"}],"active_maintainers":[{"login":"HYLcool","count":36,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/HYLcool"},{"login":"zhijianma","count":11,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/zhijianma"},{"login":"yxdyc","count":9,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/yxdyc"},{"login":"chenhesen","count":5,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/chenhesen"},{"login":"pan-x-c","count":1,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/pan-x-c"}]},"versions_url":"https://packages.ecosyste.ms/api/v1/registries/pypi.org/packages/py-data-juicer/versions","version_numbers_url":"https://packages.ecosyste.ms/api/v1/registries/pypi.org/packages/py-data-juicer/version_numbers","dependent_packages_url":"https://packages.ecosyste.ms/api/v1/registries/pypi.org/packages/py-data-juicer/dependent_packages","related_packages_url":"https://packages.ecosyste.ms/api/v1/registries/pypi.org/packages/py-data-juicer/related_packages","maintainers":[{"uuid":"data-juicer","login":"data-juicer","name":null,"email":null,"url":null,"packages_count":3,"html_url":"https://pypi.org/user/data-juicer/","role":null,"created_at":"2023-09-18T19:18:12.665Z","updated_at":"2023-09-18T19:18:12.665Z","packages_url":"https://packages.ecosyste.ms/api/v1/registries/pypi.org/maintainers/data-juicer/packages"}],"registry":{"name":"pypi.org","url":"https://pypi.org","ecosystem":"pypi","default":true,"packages_count":726284,"maintainers_count":308824,"namespaces_count":0,"keywords_count":238245,"github":"pypi","metadata":{"funded_packages_count":50579},"icon_url":"https://github.com/pypi.png","created_at":"2022-04-04T15:19:23.364Z","updated_at":"2025-09-08T05:32:36.857Z","packages_url":"https://packages.ecosyste.ms/api/v1/registries/pypi.org/packages","maintainers_url":"https://packages.ecosyste.ms/api/v1/registries/pypi.org/maintainers","namespaces_url":"https://packages.ecosyste.ms/api/v1/registries/pypi.org/namespaces"}}],"commits":{"id":1640124,"full_name":"modelscope/data-juicer","default_branch":"main","committers":[{"name":"Yilun Huang","email":"lielin.hyl@alibaba-inc.com","login":"HYLcool","count":118},{"name":"BeachWang","email":"1400012807@pku.edu.cn","login":"BeachWang","count":48},{"name":"Daoyuan Chen","email":"67475544+yxdyc","login":"yxdyc","count":45},{"name":"Ce Ge (戈策)","email":"gece@foxmail.com","login":"drcege","count":35},{"name":"zhijianma","email":"zhijian.mzj@alibaba-inc.com","login":"zhijianma","count":30},{"name":"Cathy0908","email":"30484308+Cathy0908","login":"Cathy0908","count":20},{"name":"garyzhang99","email":"46197280+garyzhang99","login":"garyzhang99","count":16},{"name":"chenhesen","email":"hesen.chs@alibaba-inc.com","login":"chenhesen","count":12},{"name":"Xuchen Pan","email":"32844285+pan-x-c","login":"pan-x-c","count":11},{"name":"Cyrus Zhang","email":"cyrus.ylzhang@gmail.com","login":"cyruszhang","count":11},{"name":"co63oc","email":"co63oc","login":"co63oc","count":10},{"name":"Yuhan Liu","email":"30294295+liuyuhanalex","login":"liuyuhanalex","count":9},{"name":"cmgzn","email":"85746275+cmgzn","login":"cmgzn","count":8},{"name":"Zhen Qin","email":"zhenqincn@gmail.com","login":"zhenqincn","count":7},{"name":"chenyushuo","email":"297086016@qq.com","login":"chenyushuo","count":5},{"name":"Qirui-jiao","email":"156628817+Qirui-jiao","login":"Qirui-jiao","count":3},{"name":"lingzhq","email":"145309613+lingzhq","login":"lingzhq","count":3},{"name":"2108038773","email":"101000927+2108038773","login":"2108038773","count":2},{"name":"JamieYu","email":"yu_haojia@foxmail.com","login":"TobyJasper","count":2},{"name":"Yuexiang XIE","email":"yuexiang.xyx@alibaba-inc.com","login":"xieyxclack","count":2},{"name":"weijie","email":"34210233+shiweijiezero","login":"shiweijiezero","count":1},{"name":"simplaj","email":"39286060+simplaj","login":"simplaj","count":1},{"name":"seanzhang-zhichen","email":"74812416+seanzhang-zhichen","login":"seanzhang-zhichen","count":1},{"name":"ricksun2023","email":"128897743+ricksun2023","login":"ricksun2023","count":1},{"name":"panghu","email":"51791120+fanronghai","login":"fanronghai","count":1},{"name":"jackylee","email":"qcsd2011@gmail.com","login":"jackylee-ch","count":1},{"name":"Yanyi Liu","email":"wolfsonliu@163.com","login":"liuyanyi","count":1},{"name":"ShenQianli","email":"shenqianli@u.nus.edu","login":"ShenQianli","count":1},{"name":"Ruidong-X","email":"xuruidong@gmail.com","login":"xuruidong","count":1},{"name":"NuODaniel","email":"zhonghanjun@baidu.com","login":"danielhjz","count":1},{"name":"JONGHO LEE","email":"ljhljh0125@gmail.com","login":"JONGSKY","count":1},{"name":"HongCheng","email":"kwchenghong@gmail.com","login":"chg0901","count":1},{"name":"Alibaba OSS","email":"opensource@alibaba-inc.com","login":"alibaba-oss","count":1}],"total_commits":410,"total_committers":33,"total_bot_commits":0,"total_bot_committers":0,"mean_commits":12.424242424242424,"dds":0.7121951219512195,"past_year_committers":[{"name":"Yilun Huang","email":"lielin.hyl@alibaba-inc.com","login":"HYLcool","count":70},{"name":"BeachWang","email":"1400012807@pku.edu.cn","login":"BeachWang","count":26},{"name":"Daoyuan Chen","email":"67475544+yxdyc","login":"yxdyc","count":20},{"name":"Cathy0908","email":"30484308+Cathy0908","login":"Cathy0908","count":17},{"name":"Ce Ge (戈策)","email":"gece@foxmail.com","login":"drcege","count":17},{"name":"Cyrus Zhang","email":"cyrus.ylzhang@gmail.com","login":"cyruszhang","count":11},{"name":"Yuhan Liu","email":"30294295+liuyuhanalex","login":"liuyuhanalex","count":9},{"name":"co63oc","email":"co63oc","login":"co63oc","count":9},{"name":"cmgzn","email":"85746275+cmgzn","login":"cmgzn","count":8},{"name":"Zhen Qin","email":"zhenqincn@gmail.com","login":"zhenqincn","count":5},{"name":"chenyushuo","email":"297086016@qq.com","login":"chenyushuo","count":5},{"name":"Qirui-jiao","email":"156628817+Qirui-jiao","login":"Qirui-jiao","count":3},{"name":"Xuchen Pan","email":"32844285+pan-x-c","login":"pan-x-c","count":3},{"name":"2108038773","email":"101000927+2108038773","login":"2108038773","count":2},{"name":"JamieYu","email":"yu_haojia@foxmail.com","login":"TobyJasper","count":2},{"name":"garyzhang99","email":"46197280+garyzhang99","login":"garyzhang99","count":2},{"name":"lingzhq","email":"145309613+lingzhq","login":"lingzhq","count":2},{"name":"NuODaniel","email":"zhonghanjun@baidu.com","login":"danielhjz","count":1},{"name":"ShenQianli","email":"shenqianli@u.nus.edu","login":"ShenQianli","count":1},{"name":"jackylee","email":"qcsd2011@gmail.com","login":"jackylee-ch","count":1},{"name":"panghu","email":"51791120+fanronghai","login":"fanronghai","count":1},{"name":"ricksun2023","email":"128897743+ricksun2023","login":"ricksun2023","count":1},{"name":"zhijianma","email":"zhijian.mzj@alibaba-inc.com","login":"zhijianma","count":1}],"past_year_total_commits":217,"past_year_total_committers":23,"past_year_total_bot_commits":0,"past_year_total_bot_committers":0,"past_year_mean_commits":9.434782608695652,"past_year_dds":0.6774193548387097,"last_synced_at":"2025-08-11T07:44:32.693Z","last_synced_commit":"26a51305f3b6fb29dffc38b1d1f83c167787652e","created_at":"2024-07-25T15:35:12.426Z","updated_at":"2025-08-11T07:44:34.652Z","commits_url":"https://commits.ecosyste.ms/api/v1/hosts/GitHub/repositories/modelscope%2Fdata-juicer/commits","host":{"name":"GitHub","url":"https://github.com","kind":"github","last_synced_at":"2025-08-15T00:00:12.661Z","repositories_count":5478302,"commits_count":853215296,"contributors_count":31091500,"owners_count":906512,"icon_url":"https://github.com/github.png","host_url":"https://commits.ecosyste.ms/api/v1/hosts/GitHub","repositories_url":"https://commits.ecosyste.ms/api/v1/hosts/GitHub/repositories"}},"issues_stats":{"full_name":"modelscope/data-juicer","html_url":"https://github.com/modelscope/data-juicer","last_synced_at":"2025-09-08T03:01:45.979Z","status":"active","issues_count":172,"pull_requests_count":593,"avg_time_to_close_issue":2550368.51,"avg_time_to_close_pull_request":739570.8767967146,"issues_closed_count":100,"pull_requests_closed_count":485,"pull_request_authors_count":34,"issue_authors_count":114,"avg_comments_per_issue":1.8023255813953487,"avg_comments_per_pull_request":0.42327150084317033,"merged_pull_requests_count":433,"bot_issues_count":0,"bot_pull_requests_count":0,"past_year_issues_count":113,"past_year_pull_requests_count":420,"past_year_avg_time_to_close_issue":2230683.0384615385,"past_year_avg_time_to_close_pull_request":519346.3869047619,"past_year_issues_closed_count":52,"past_year_pull_requests_closed_count":336,"past_year_pull_request_authors_count":28,"past_year_issue_authors_count":83,"past_year_avg_comments_per_issue":1.1150442477876106,"past_year_avg_comments_per_pull_request":0.34285714285714286,"past_year_bot_issues_count":0,"past_year_bot_pull_requests_count":0,"past_year_merged_pull_requests_count":303,"created_at":"2024-07-25T15:35:42.858Z","updated_at":"2025-09-08T03:01:45.980Z","repository_url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/repositories/modelscope%2Fdata-juicer","issues_url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/repositories/modelscope%2Fdata-juicer/issues","issue_labels_count":{"question":78,"bug":43,"stale-issue":35,"enhancement":35,"dj:op":8,"dj:multimodal":5,"dj:dist":3,"priority:high":2,"good first issue":2,"documentation":2,"competition:BetterSynth":1,"dj:dataset":1,"dj:post-tuning":1,"environment":1,"dj:core":1,"help wanted":1},"pull_request_labels_count":{"enhancement":178,"documentation":98,"dj:op":91,"bug":89,"dj:multimodal":46,"dj:ci/cd":39,"dj:core":37,"dj:dist":25,"environment":23,"dj:efficiency":16,"dj:dataset":12,"dj:cookbook":12,"dj:post-tuning":8,"stale-pr":8,"priority:high":8,"agent":6,"good first issue":6,"invalid":4,"dj:tools":4,"duplicate":2,"dj:text":2,"dj:lite":2,"dj-ci/cd":1},"issue_author_associations_count":{"NONE":133,"COLLABORATOR":36,"CONTRIBUTOR":3},"pull_request_author_associations_count":{"COLLABORATOR":496,"CONTRIBUTOR":74,"NONE":22,"OWNER":1},"issue_authors":{"BeachWang":10,"yxdyc":10,"drcege":9,"HYLcool":6,"abchbx":4,"simplew2011":4,"javapythonphp":3,"charonkk":3,"HunterLG":3,"DietDietDiet":2,"wangpi26":2,"tian969":2,"baiyi-os":2,"Yang-QW":2,"xiafeng-nb":2,"Young-zj":2,"weiaicunzai":2,"butterbutterflies":2,"xiedeyantu":2,"echo-valor":2,"edc3000":2,"huangkaipeng4399":2,"ycwfs":2,"Fatima-0SA":2,"Tendo33":1,"heningsu":1,"user2311717757":1,"WangJunjie97":1,"xunmenglt":1,"coder4nlp":1,"laolv421":1,"gongysh2004":1,"SkyAndFly":1,"JunyuanLi0408":1,"spacegrass":1,"lh61500":1,"hengshan123":1,"Snow0111":1,"tiandidatongJLR":1,"ZHJ19970917":1,"monsieurzhang":1,"lingzhq":1,"hhhhsc701":1,"TendouArisu":1,"arturia-Xayah":1,"Notonion":1,"MLikeWater":1,"flyflypeng":1,"fengzx99":1,"qyx1121":1,"aruig666":1,"noforit":1,"MingdongHe":1,"wqdta":1,"DonaldRR":1,"ctgushiwei":1,"976311200":1,"calledice":1,"timturing":1,"cycychenyi":1,"mkzzz":1,"ken-arf":1,"pan-x-c":1,"tuninger":1,"moyans":1,"nihaoqingtuan":1,"zdbss1990":1,"yaun248":1,"strongcc":1,"hxdsdu":1,"Night-Quiet":1,"TobyJasper":1,"cist":1,"Cheendfdf":1,"HalcyonLiang":1,"finger1517":1,"Chain123":1,"kike-0304":1,"fanronghai":1,"hastaluegoph":1,"zytcharming":1,"luckystar1992":1,"ftgreat":1,"Rao-student":1,"ariexBear":1,"obj12":1,"Crazy-JY":1,"Cccccc0630":1,"lucasjinreal":1,"FailedNamed":1,"lilqz66":1,"Xingyu-Romantic":1,"stalwart0465":1,"angus-deepmentor":1,"ForeverNewLee":1,"sherrytonger":1,"awangzy":1,"serser":1,"zhenqincn":1,"yywhsgnd":1,"00jdLiu":1,"lihongxiacream":1,"promisecc":1,"BenWu11":1,"ckd0817":1,"SWEENEYHE":1,"xiaofanloveme":1,"koanho":1,"IvanDeng0":1,"ellie77ovo":1,"Reneea1":1,"HaleYang":1,"Kaimary":1,"Mr-lonely0":1},"pull_request_authors":{"HYLcool":121,"BeachWang":82,"drcege":62,"yxdyc":48,"Cathy0908":38,"liuyuhanalex":28,"cyruszhang":27,"garyzhang99":26,"cmgzn":23,"co63oc":19,"Qirui-jiao":17,"pan-x-c":16,"chenyushuo":12,"zhenqincn":11,"lingzhq":10,"2108038773":8,"zhijianma":8,"SYSUzhouting":5,"TobyJasper":4,"xiaokun-hadoop":4,"jackylee-ch":4,"ricksun2023":2,"Bat-Reality":2,"danielhjz":2,"coolderli":2,"chenhesen":2,"fanronghai":2,"Cccccc0630":2,"simplaj":1,"shiweijiezero":1,"ruokic":1,"ShenQianli":1,"ycwfs":1,"seanzhang-zhichen":1},"host":{"name":"GitHub","url":"https://github.com","kind":"github","last_synced_at":"2025-09-08T00:00:17.013Z","repositories_count":10170187,"issues_count":32015436,"pull_requests_count":100280336,"authors_count":10731543,"icon_url":"https://github.com/github.png","host_url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub","repositories_url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/repositories","owners_url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/owners","authors_url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors"},"past_year_issue_labels_count":{"question":60,"bug":25,"enhancement":19,"stale-issue":8,"dj:op":3,"priority:high":1,"dj:multimodal":1,"dj:dataset":1,"dj:dist":1,"dj:post-tuning":1,"environment":1,"good first issue":1,"dj:core":1,"documentation":1},"past_year_pull_request_labels_count":{"enhancement":118,"documentation":72,"bug":63,"dj:op":61,"dj:ci/cd":39,"dj:core":37,"environment":23,"dj:multimodal":17,"dj:efficiency":16,"dj:dist":14,"dj:cookbook":12,"dj:dataset":9,"dj:post-tuning":8,"agent":6,"dj:tools":4,"invalid":4,"good first issue":4,"priority:high":3,"duplicate":2,"dj:text":2,"dj:lite":2,"dj-ci/cd":1},"past_year_issue_author_associations_count":{"NONE":94,"COLLABORATOR":17,"CONTRIBUTOR":2},"past_year_pull_request_author_associations_count":{"COLLABORATOR":342,"CONTRIBUTOR":63,"NONE":14,"OWNER":1},"past_year_issue_authors":{"BeachWang":6,"HYLcool":4,"abchbx":4,"yxdyc":3,"drcege":3,"HunterLG":3,"charonkk":3,"javapythonphp":3,"wangpi26":2,"DietDietDiet":2,"Young-zj":2,"baiyi-os":2,"butterbutterflies":2,"weiaicunzai":2,"xiedeyantu":2,"huangkaipeng4399":2,"edc3000":2,"serser":1,"SkyAndFly":1,"spacegrass":1,"stalwart0465":1,"strongcc":1,"zytcharming":1,"Reneea1":1,"Rao-student":1,"pan-x-c":1,"obj12":1,"nihaoqingtuan":1,"Night-Quiet":1,"monsieurzhang":1,"fanronghai":1,"zhenqincn":1,"zdbss1990":1,"yywhsgnd":1,"yaun248":1,"xunmenglt":1,"Xingyu-Romantic":1,"xiaofanloveme":1,"xiafeng-nb":1,"wqdta":1,"WangJunjie97":1,"user2311717757":1,"tuninger":1,"TobyJasper":1,"timturing":1,"tiandidatongJLR":1,"Tendo33":1,"SWEENEYHE":1,"FailedNamed":1,"ellie77ovo":1,"DonaldRR":1,"cycychenyi":1,"ctgushiwei":1,"Crazy-JY":1,"ckd0817":1,"cist":1,"Cheendfdf":1,"Cccccc0630":1,"calledice":1,"awangzy":1,"aruig666":1,"ariexBear":1,"angus-deepmentor":1,"976311200":1,"00jdLiu":1,"MLikeWater":1,"mkzzz":1,"luckystar1992":1,"ken-arf":1,"Kaimary":1,"JunyuanLi0408":1,"IvanDeng0":1,"hxdsdu":1,"hhhhsc701":1,"heningsu":1,"hengshan123":1,"hastaluegoph":1,"gongysh2004":1,"ftgreat":1,"ForeverNewLee":1,"finger1517":1,"fengzx99":1,"Fatima-0SA":1},"past_year_pull_request_authors":{"HYLcool":108,"BeachWang":52,"Cathy0908":33,"yxdyc":29,"liuyuhanalex":28,"drcege":28,"cyruszhang":27,"cmgzn":23,"co63oc":19,"chenyushuo":12,"lingzhq":9,"zhenqincn":8,"Qirui-jiao":6,"pan-x-c":6,"SYSUzhouting":4,"jackylee-ch":4,"xiaokun-hadoop":4,"coolderli":2,"ricksun2023":2,"Bat-Reality":2,"2108038773":2,"danielhjz":2,"zhijianma":2,"TobyJasper":2,"fanronghai":2,"Cccccc0630":2,"ShenQianli":1,"ruokic":1},"maintainers":[{"login":"HYLcool","count":127,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/HYLcool"},{"login":"BeachWang","count":92,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/BeachWang"},{"login":"drcege","count":71,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/drcege"},{"login":"yxdyc","count":58,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/yxdyc"},{"login":"Cathy0908","count":36,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/Cathy0908"},{"login":"cyruszhang","count":27,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/cyruszhang"},{"login":"garyzhang99","count":26,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/garyzhang99"},{"login":"cmgzn","count":23,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/cmgzn"},{"login":"pan-x-c","count":17,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/pan-x-c"},{"login":"Qirui-jiao","count":17,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/Qirui-jiao"},{"login":"chenyushuo","count":12,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/chenyushuo"},{"login":"lingzhq","count":9,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/lingzhq"},{"login":"zhijianma","count":8,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/zhijianma"},{"login":"SYSUzhouting","count":4,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/SYSUzhouting"},{"login":"Bat-Reality","count":2,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/Bat-Reality"},{"login":"chenhesen","count":2,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/chenhesen"},{"login":"ShenQianli","count":1,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/ShenQianli"},{"login":"ruokic","count":1,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/ruokic"}],"active_maintainers":[{"login":"HYLcool","count":112,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/HYLcool"},{"login":"BeachWang","count":58,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/BeachWang"},{"login":"Cathy0908","count":33,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/Cathy0908"},{"login":"yxdyc","count":32,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/yxdyc"},{"login":"drcege","count":31,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/drcege"},{"login":"cyruszhang","count":27,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/cyruszhang"},{"login":"cmgzn","count":23,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/cmgzn"},{"login":"chenyushuo","count":12,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/chenyushuo"},{"login":"lingzhq","count":9,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/lingzhq"},{"login":"pan-x-c","count":7,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/pan-x-c"},{"login":"Qirui-jiao","count":6,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/Qirui-jiao"},{"login":"SYSUzhouting","count":4,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/SYSUzhouting"},{"login":"Bat-Reality","count":2,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/Bat-Reality"},{"login":"zhijianma","count":2,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/zhijianma"},{"login":"ShenQianli","count":1,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/ShenQianli"},{"login":"ruokic","count":1,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/ruokic"}]},"events":{"total":{"CreateEvent":145,"ReleaseEvent":16,"IssuesEvent":163,"WatchEvent":2073,"DeleteEvent":173,"MemberEvent":6,"IssueCommentEvent":274,"PushEvent":1129,"PullRequestReviewEvent":477,"PullRequestReviewCommentEvent":353,"PullRequestEvent":359,"ForkEvent":93},"last_year":{"CreateEvent":145,"ReleaseEvent":16,"IssuesEvent":163,"WatchEvent":2073,"DeleteEvent":173,"MemberEvent":6,"IssueCommentEvent":274,"PushEvent":1129,"PullRequestReviewEvent":477,"PullRequestReviewCommentEvent":353,"PullRequestEvent":359,"ForkEvent":93}},"keywords":["data","data-analysis","data-pipeline","data-processing","data-science","data-visualization","foundation-models","instruction-tuning","large-language-models","llm","llms","multi-modal","pre-training","synthetic-data"],"dependencies":[],"score":19.248528514609006,"created_at":"2025-09-08T20:54:59.187Z","updated_at":"2025-10-07T08:27:58.958Z","avatar_url":"https://github.com/modelscope.png","language":"Python","category":null,"sub_category":null,"monthly_downloads":1330,"funding_links":[],"readme_doi_urls":[],"works":{},"citation_counts":{},"total_citations":0,"keywords_from_contributors":["transformer","graph-computation"],"project_url":"https://science.ecosyste.ms/api/v1/projects/78190","html_url":"https://science.ecosyste.ms/projects/78190"}