{"id":55174,"name":"datatrove","description":"Freeing data processing from scripting madness by providing a set of platform-agnostic customizable pipeline processing blocks.","url":"https://github.com/huggingface/datatrove","last_synced_at":"2025-09-05T13:48:16.122Z","repository":{"id":218113068,"uuid":"653623369","full_name":"huggingface/datatrove","owner":"huggingface","description":"Freeing data processing from scripting madness by providing a set of platform-agnostic customizable pipeline processing blocks.","archived":false,"fork":false,"pushed_at":"2025-08-26T13:06:41.000Z","size":34077,"stargazers_count":2562,"open_issues_count":83,"forks_count":205,"subscribers_count":47,"default_branch":"main","last_synced_at":"2025-08-28T22:18:24.471Z","etag":null,"topics":[],"latest_commit_sha":null,"homepage":"","language":"Python","has_issues":true,"has_wiki":null,"has_pages":null,"mirror_url":null,"source_name":null,"license":"apache-2.0","status":null,"scm":"git","pull_requests_enabled":true,"icon_url":"https://github.com/huggingface.png","metadata":{"files":{"readme":"README.md","changelog":null,"contributing":null,"funding":null,"license":"LICENSE","code_of_conduct":null,"threat_model":null,"audit":null,"citation":"CITATION.cff","codeowners":null,"security":null,"support":null,"governance":null,"roadmap":null,"authors":null,"dei":null,"publiccode":null,"codemeta":null,"zenodo":null}},"created_at":"2023-06-14T12:05:28.000Z","updated_at":"2025-08-28T15:15:09.000Z","dependencies_parsed_at":"2024-02-17T16:30:40.456Z","dependency_job_id":"5a0f2b08-0b74-4244-a8bd-09e9fc40306b","html_url":"https://github.com/huggingface/datatrove","commit_stats":{"total_commits":419,"total_committers":38,"mean_commits":"11.026315789473685","dds":0.3317422434367542,"last_synced_commit":"371c014374dd1805e42fd58a8a91dcb1309abb2f"},"previous_names":["huggingface/datatrove"],"tags_count":5,"template":false,"template_full_name":null,"purl":"pkg:github/huggingface/datatrove","repository_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove","tags_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove/tags","releases_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove/releases","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove/manifests","owner_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/huggingface","download_url":"https://codeload.github.com/huggingface/datatrove/tar.gz/refs/heads/main","sbom_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove/sbom","scorecard":null,"host":{"name":"GitHub","url":"https://github.com","kind":"github","repositories_count":273767456,"owners_count":25164461,"icon_url":"https://github.com/github.png","version":null,"created_at":"2022-05-30T11:31:42.601Z","updated_at":"2022-07-04T15:15:14.044Z","status":"online","status_checked_at":"2025-09-05T02:00:09.113Z","response_time":402,"last_error":null,"robots_txt_status":"success","robots_txt_updated_at":"2025-07-24T06:49:26.215Z","robots_txt_url":"https://github.com/robots.txt","online":true,"can_crawl_api":true,"host_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub","repositories_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories","repository_names_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repository_names","owners_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners"}},"owner":{"login":"huggingface","name":"Hugging Face","uuid":"25720743","kind":"organization","description":"The AI community building the future.","email":null,"website":"https://huggingface.co/","location":"NYC + Paris","twitter":"huggingface","company":null,"icon_url":"https://avatars.githubusercontent.com/u/25720743?v=4","repositories_count":344,"last_synced_at":"2025-08-28T17:09:23.402Z","metadata":{"has_sponsors_listing":false},"html_url":"https://github.com/huggingface","funding_links":[],"total_stars":581025,"followers":53217,"following":0,"created_at":"2022-11-02T16:28:23.192Z","updated_at":"2025-08-28T17:09:23.402Z","owner_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/huggingface","repositories_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/huggingface/repositories"},"packages":[{"id":11565531,"name":"github.com/huggingface/datatrove","ecosystem":"go","description":null,"homepage":null,"licenses":"apache-2.0","normalized_licenses":["Apache-2.0"],"repository_url":"https://github.com/huggingface/datatrove","keywords_array":[],"namespace":null,"versions_count":6,"first_release_published_at":"2024-02-07T09:57:48.000Z","latest_release_published_at":"2025-08-07T18:53:44.000Z","latest_release_number":"v0.6.0","last_synced_at":"2025-08-29T01:58:58.249Z","created_at":"2025-04-19T07:30:14.128Z","updated_at":"2025-08-29T02:19:14.413Z","registry_url":"https://pkg.go.dev/github.com/huggingface/datatrove","install_command":"go get github.com/huggingface/datatrove","documentation_url":"https://pkg.go.dev/github.com/huggingface/datatrove#section-documentation","metadata":{},"repo_metadata":{"id":218113068,"uuid":"653623369","full_name":"huggingface/datatrove","owner":"huggingface","description":"Freeing data processing from scripting madness by providing a set of platform-agnostic customizable pipeline processing blocks.","archived":false,"fork":false,"pushed_at":"2025-08-26T13:06:41.000Z","size":34077,"stargazers_count":2562,"open_issues_count":83,"forks_count":205,"subscribers_count":47,"default_branch":"main","last_synced_at":"2025-08-28T22:18:24.471Z","etag":null,"topics":[],"latest_commit_sha":null,"homepage":"","language":"Python","has_issues":true,"has_wiki":null,"has_pages":null,"mirror_url":null,"source_name":null,"license":"apache-2.0","status":null,"scm":"git","pull_requests_enabled":true,"icon_url":"https://github.com/huggingface.png","metadata":{"files":{"readme":"README.md","changelog":null,"contributing":null,"funding":null,"license":"LICENSE","code_of_conduct":null,"threat_model":null,"audit":null,"citation":"CITATION.cff","codeowners":null,"security":null,"support":null,"governance":null,"roadmap":null,"authors":null,"dei":null,"publiccode":null,"codemeta":null,"zenodo":null}},"created_at":"2023-06-14T12:05:28.000Z","updated_at":"2025-08-28T15:15:09.000Z","dependencies_parsed_at":"2024-02-17T16:30:40.456Z","dependency_job_id":"5a0f2b08-0b74-4244-a8bd-09e9fc40306b","html_url":"https://github.com/huggingface/datatrove","commit_stats":{"total_commits":419,"total_committers":38,"mean_commits":"11.026315789473685","dds":0.3317422434367542,"last_synced_commit":"371c014374dd1805e42fd58a8a91dcb1309abb2f"},"previous_names":["huggingface/datatrove"],"tags_count":5,"template":false,"template_full_name":null,"purl":"pkg:github/huggingface/datatrove","repository_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove","tags_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove/tags","releases_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove/releases","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove/manifests","owner_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/huggingface","download_url":"https://codeload.github.com/huggingface/datatrove/tar.gz/refs/heads/main","sbom_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove/sbom","scorecard":null,"host":{"name":"GitHub","url":"https://github.com","kind":"github","repositories_count":272610807,"owners_count":24964354,"icon_url":"https://github.com/github.png","version":null,"created_at":"2022-05-30T11:31:42.601Z","updated_at":"2022-07-04T15:15:14.044Z","status":"online","status_checked_at":"2025-08-29T02:00:10.610Z","response_time":87,"last_error":null,"robots_txt_status":"success","robots_txt_updated_at":"2025-07-24T06:49:26.215Z","robots_txt_url":"https://github.com/robots.txt","online":true,"can_crawl_api":true,"host_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub","repositories_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories","repository_names_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repository_names","owners_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners"},"owner_record":{"login":"huggingface","name":"Hugging Face","uuid":"25720743","kind":"organization","description":"The AI community building the future.","email":null,"website":"https://huggingface.co/","location":"NYC + Paris","twitter":"huggingface","company":null,"icon_url":"https://avatars.githubusercontent.com/u/25720743?v=4","repositories_count":344,"last_synced_at":"2025-08-28T17:09:23.402Z","metadata":{"has_sponsors_listing":false},"html_url":"https://github.com/huggingface","funding_links":[],"total_stars":581025,"followers":53217,"following":0,"created_at":"2022-11-02T16:28:23.192Z","updated_at":"2025-08-28T17:09:23.402Z","owner_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/huggingface","repositories_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/huggingface/repositories"},"tags":[{"name":"v0.5.0","sha":"99206aaf6ea86ba39d05a831d532665ea612d686","kind":"commit","published_at":"2025-04-30T16:43:03.000Z","download_url":"https://codeload.github.com/huggingface/datatrove/tar.gz/v0.5.0","html_url":"https://github.com/huggingface/datatrove/releases/tag/v0.5.0","dependencies_parsed_at":null,"dependency_job_id":null,"purl":"pkg:github/huggingface/datatrove@v0.5.0","tag_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove/tags/v0.5.0","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove/tags/v0.5.0/manifests"},{"name":"v0.4.0","sha":"842b241c23bbd2aaa5c102a28a26b3c3a98589bb","kind":"commit","published_at":"2024-12-06T18:23:33.000Z","download_url":"https://codeload.github.com/huggingface/datatrove/tar.gz/v0.4.0","html_url":"https://github.com/huggingface/datatrove/releases/tag/v0.4.0","dependencies_parsed_at":null,"dependency_job_id":null,"purl":"pkg:github/huggingface/datatrove@v0.4.0","tag_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove/tags/v0.4.0","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove/tags/v0.4.0/manifests"},{"name":"v0.3.0","sha":"d95e0ee85d3ce3a376c46dfdbf22b0f23749b654","kind":"commit","published_at":"2024-08-28T15:36:23.000Z","download_url":"https://codeload.github.com/huggingface/datatrove/tar.gz/v0.3.0","html_url":"https://github.com/huggingface/datatrove/releases/tag/v0.3.0","dependencies_parsed_at":null,"dependency_job_id":null,"purl":"pkg:github/huggingface/datatrove@v0.3.0","tag_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove/tags/v0.3.0","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove/tags/v0.3.0/manifests"},{"name":"v0.2.0","sha":"6d06210c337b6b54dfc48bce44ac32316da84f86","kind":"commit","published_at":"2024-04-22T16:53:45.000Z","download_url":"https://codeload.github.com/huggingface/datatrove/tar.gz/v0.2.0","html_url":"https://github.com/huggingface/datatrove/releases/tag/v0.2.0","dependencies_parsed_at":null,"dependency_job_id":null,"purl":"pkg:github/huggingface/datatrove@v0.2.0","tag_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove/tags/v0.2.0","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove/tags/v0.2.0/manifests"},{"name":"v0.0.1","sha":"bd3c89a2cf65320d42593eb4ab7975cbea878143","kind":"tag","published_at":"2024-02-07T09:58:06.000Z","download_url":"https://codeload.github.com/huggingface/datatrove/tar.gz/v0.0.1","html_url":"https://github.com/huggingface/datatrove/releases/tag/v0.0.1","dependencies_parsed_at":null,"dependency_job_id":null,"purl":"pkg:github/huggingface/datatrove@v0.0.1","tag_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove/tags/v0.0.1","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove/tags/v0.0.1/manifests"}]},"repo_metadata_updated_at":"2025-08-29T02:19:14.412Z","dependent_packages_count":0,"downloads":null,"downloads_period":null,"dependent_repos_count":0,"rankings":{"downloads":null,"dependent_repos_count":5.942714174127788,"dependent_packages_count":5.5696718094475886,"stargazers_count":null,"forks_count":null,"docker_downloads_count":null,"average":5.756192991787689},"purl":"pkg:golang/github.com/huggingface/datatrove","advisories":[],"docker_usage_url":"https://docker.ecosyste.ms/usage/go/github.com/huggingface/datatrove","docker_dependents_count":null,"docker_downloads_count":null,"usage_url":"https://repos.ecosyste.ms/usage/go/github.com/huggingface/datatrove","dependent_repositories_url":"https://repos.ecosyste.ms/api/v1/usage/go/github.com/huggingface/datatrove/dependencies","status":null,"funding_links":[],"critical":null,"issue_metadata":{"last_synced_at":"2025-08-29T02:19:13.612Z","issues_count":86,"pull_requests_count":131,"avg_time_to_close_issue":3895737.906976744,"avg_time_to_close_pull_request":1246380.8661417323,"issues_closed_count":43,"pull_requests_closed_count":127,"pull_request_authors_count":38,"issue_authors_count":48,"avg_comments_per_issue":2.5,"avg_comments_per_pull_request":0.8854961832061069,"merged_pull_requests_count":114,"bot_issues_count":0,"bot_pull_requests_count":2,"past_year_issues_count":58,"past_year_pull_requests_count":46,"past_year_avg_time_to_close_issue":3863882.619047619,"past_year_avg_time_to_close_pull_request":2938278.976190476,"past_year_issues_closed_count":21,"past_year_pull_requests_closed_count":42,"past_year_pull_request_authors_count":25,"past_year_issue_authors_count":34,"past_year_avg_comments_per_issue":1.8448275862068966,"past_year_avg_comments_per_pull_request":1.065217391304348,"past_year_bot_issues_count":0,"past_year_bot_pull_requests_count":2,"past_year_merged_pull_requests_count":36,"issues_url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove/issues","maintainers":[{"login":"guipenedo","count":42,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/guipenedo"},{"login":"thomwolf","count":8,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/thomwolf"},{"login":"anton-l","count":2,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/anton-l"},{"login":"garrethlee","count":1,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/garrethlee"},{"login":"sayakpaul","count":1,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/sayakpaul"}],"active_maintainers":[{"login":"guipenedo","count":6,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/guipenedo"},{"login":"garrethlee","count":1,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/garrethlee"}]},"versions_url":"https://packages.ecosyste.ms/api/v1/registries/proxy.golang.org/packages/github.com%2Fhuggingface%2Fdatatrove/versions","version_numbers_url":"https://packages.ecosyste.ms/api/v1/registries/proxy.golang.org/packages/github.com%2Fhuggingface%2Fdatatrove/version_numbers","dependent_packages_url":"https://packages.ecosyste.ms/api/v1/registries/proxy.golang.org/packages/github.com%2Fhuggingface%2Fdatatrove/dependent_packages","related_packages_url":"https://packages.ecosyste.ms/api/v1/registries/proxy.golang.org/packages/github.com%2Fhuggingface%2Fdatatrove/related_packages","maintainers":[],"registry":{"name":"proxy.golang.org","url":"https://proxy.golang.org","ecosystem":"go","default":true,"packages_count":1952121,"maintainers_count":0,"namespaces_count":741275,"keywords_count":109185,"github":"golang","metadata":{"funded_packages_count":49011},"icon_url":"https://github.com/golang.png","created_at":"2022-04-04T15:19:22.939Z","updated_at":"2025-09-05T05:14:06.439Z","packages_url":"https://packages.ecosyste.ms/api/v1/registries/proxy.golang.org/packages","maintainers_url":"https://packages.ecosyste.ms/api/v1/registries/proxy.golang.org/maintainers","namespaces_url":"https://packages.ecosyste.ms/api/v1/registries/proxy.golang.org/namespaces"}},{"id":9772860,"name":"testing-datatrove","ecosystem":"pypi","description":"HuggingFace library to process and filter large amounts of webdata","homepage":null,"licenses":"Apache-2.0","normalized_licenses":["Apache-2.0"],"repository_url":"https://github.com/huggingface/datatrove","keywords_array":["data","machine","learning","processing"],"namespace":null,"versions_count":1,"first_release_published_at":"2024-04-22T14:27:24.000Z","latest_release_published_at":"2024-04-22T14:27:24.000Z","latest_release_number":"4.0.1","last_synced_at":"2024-12-09T14:59:21.644Z","created_at":"2024-04-22T14:31:26.690Z","updated_at":"2025-08-29T02:19:14.413Z","registry_url":"https://pypi.org/project/testing-datatrove/","install_command":"pip install testing-datatrove --index-url https://pypi.org/simple","documentation_url":"https://testing-datatrove.readthedocs.io/","metadata":{"funding":null,"documentation":null,"classifiers":["Intended Audience :: Developers","Intended Audience :: Education","Intended Audience :: Science/Research","License :: OSI Approved :: Apache Software License","Operating System :: OS Independent","Programming Language :: Python :: 3","Programming Language :: Python :: 3.10","Programming Language :: Python :: 3.11","Programming Language :: Python :: 3.12","Topic :: Scientific/Engineering :: Artificial Intelligence"],"normalized_name":"testing-datatrove"},"repo_metadata":{"id":218113068,"uuid":"653623369","full_name":"huggingface/datatrove","owner":"huggingface","description":"Freeing data processing from scripting madness by providing a set of platform-agnostic customizable pipeline processing blocks.","archived":false,"fork":false,"pushed_at":"2025-08-26T13:06:41.000Z","size":34077,"stargazers_count":2562,"open_issues_count":83,"forks_count":205,"subscribers_count":47,"default_branch":"main","last_synced_at":"2025-08-28T22:18:24.471Z","etag":null,"topics":[],"latest_commit_sha":null,"homepage":"","language":"Python","has_issues":true,"has_wiki":null,"has_pages":null,"mirror_url":null,"source_name":null,"license":"apache-2.0","status":null,"scm":"git","pull_requests_enabled":true,"icon_url":"https://github.com/huggingface.png","metadata":{"files":{"readme":"README.md","changelog":null,"contributing":null,"funding":null,"license":"LICENSE","code_of_conduct":null,"threat_model":null,"audit":null,"citation":"CITATION.cff","codeowners":null,"security":null,"support":null,"governance":null,"roadmap":null,"authors":null,"dei":null,"publiccode":null,"codemeta":null,"zenodo":null}},"created_at":"2023-06-14T12:05:28.000Z","updated_at":"2025-08-28T15:15:09.000Z","dependencies_parsed_at":"2024-02-17T16:30:40.456Z","dependency_job_id":"5a0f2b08-0b74-4244-a8bd-09e9fc40306b","html_url":"https://github.com/huggingface/datatrove","commit_stats":{"total_commits":419,"total_committers":38,"mean_commits":"11.026315789473685","dds":0.3317422434367542,"last_synced_commit":"371c014374dd1805e42fd58a8a91dcb1309abb2f"},"previous_names":["huggingface/datatrove"],"tags_count":5,"template":false,"template_full_name":null,"purl":"pkg:github/huggingface/datatrove","repository_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove","tags_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove/tags","releases_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove/releases","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove/manifests","owner_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/huggingface","download_url":"https://codeload.github.com/huggingface/datatrove/tar.gz/refs/heads/main","sbom_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove/sbom","scorecard":null,"host":{"name":"GitHub","url":"https://github.com","kind":"github","repositories_count":272610807,"owners_count":24964354,"icon_url":"https://github.com/github.png","version":null,"created_at":"2022-05-30T11:31:42.601Z","updated_at":"2022-07-04T15:15:14.044Z","status":"online","status_checked_at":"2025-08-29T02:00:10.610Z","response_time":87,"last_error":null,"robots_txt_status":"success","robots_txt_updated_at":"2025-07-24T06:49:26.215Z","robots_txt_url":"https://github.com/robots.txt","online":true,"can_crawl_api":true,"host_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub","repositories_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories","repository_names_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repository_names","owners_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners"},"owner_record":{"login":"huggingface","name":"Hugging Face","uuid":"25720743","kind":"organization","description":"The AI community building the future.","email":null,"website":"https://huggingface.co/","location":"NYC + Paris","twitter":"huggingface","company":null,"icon_url":"https://avatars.githubusercontent.com/u/25720743?v=4","repositories_count":344,"last_synced_at":"2025-08-28T17:09:23.402Z","metadata":{"has_sponsors_listing":false},"html_url":"https://github.com/huggingface","funding_links":[],"total_stars":581025,"followers":53217,"following":0,"created_at":"2022-11-02T16:28:23.192Z","updated_at":"2025-08-28T17:09:23.402Z","owner_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/huggingface","repositories_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/huggingface/repositories"},"tags":[{"name":"v0.5.0","sha":"99206aaf6ea86ba39d05a831d532665ea612d686","kind":"commit","published_at":"2025-04-30T16:43:03.000Z","download_url":"https://codeload.github.com/huggingface/datatrove/tar.gz/v0.5.0","html_url":"https://github.com/huggingface/datatrove/releases/tag/v0.5.0","dependencies_parsed_at":null,"dependency_job_id":null,"purl":"pkg:github/huggingface/datatrove@v0.5.0","tag_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove/tags/v0.5.0","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove/tags/v0.5.0/manifests"},{"name":"v0.4.0","sha":"842b241c23bbd2aaa5c102a28a26b3c3a98589bb","kind":"commit","published_at":"2024-12-06T18:23:33.000Z","download_url":"https://codeload.github.com/huggingface/datatrove/tar.gz/v0.4.0","html_url":"https://github.com/huggingface/datatrove/releases/tag/v0.4.0","dependencies_parsed_at":null,"dependency_job_id":null,"purl":"pkg:github/huggingface/datatrove@v0.4.0","tag_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove/tags/v0.4.0","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove/tags/v0.4.0/manifests"},{"name":"v0.3.0","sha":"d95e0ee85d3ce3a376c46dfdbf22b0f23749b654","kind":"commit","published_at":"2024-08-28T15:36:23.000Z","download_url":"https://codeload.github.com/huggingface/datatrove/tar.gz/v0.3.0","html_url":"https://github.com/huggingface/datatrove/releases/tag/v0.3.0","dependencies_parsed_at":null,"dependency_job_id":null,"purl":"pkg:github/huggingface/datatrove@v0.3.0","tag_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove/tags/v0.3.0","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove/tags/v0.3.0/manifests"},{"name":"v0.2.0","sha":"6d06210c337b6b54dfc48bce44ac32316da84f86","kind":"commit","published_at":"2024-04-22T16:53:45.000Z","download_url":"https://codeload.github.com/huggingface/datatrove/tar.gz/v0.2.0","html_url":"https://github.com/huggingface/datatrove/releases/tag/v0.2.0","dependencies_parsed_at":null,"dependency_job_id":null,"purl":"pkg:github/huggingface/datatrove@v0.2.0","tag_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove/tags/v0.2.0","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove/tags/v0.2.0/manifests"},{"name":"v0.0.1","sha":"bd3c89a2cf65320d42593eb4ab7975cbea878143","kind":"tag","published_at":"2024-02-07T09:58:06.000Z","download_url":"https://codeload.github.com/huggingface/datatrove/tar.gz/v0.0.1","html_url":"https://github.com/huggingface/datatrove/releases/tag/v0.0.1","dependencies_parsed_at":null,"dependency_job_id":null,"purl":"pkg:github/huggingface/datatrove@v0.0.1","tag_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove/tags/v0.0.1","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove/tags/v0.0.1/manifests"}]},"repo_metadata_updated_at":"2025-08-29T02:19:14.412Z","dependent_packages_count":0,"downloads":null,"downloads_period":"last-month","dependent_repos_count":0,"rankings":{"downloads":null,"dependent_repos_count":62.7176608053806,"dependent_packages_count":9.50570071301216,"stargazers_count":null,"forks_count":null,"docker_downloads_count":null,"average":36.11168075919638},"purl":"pkg:pypi/testing-datatrove","advisories":[],"docker_usage_url":"https://docker.ecosyste.ms/usage/pypi/testing-datatrove","docker_dependents_count":null,"docker_downloads_count":null,"usage_url":"https://repos.ecosyste.ms/usage/pypi/testing-datatrove","dependent_repositories_url":"https://repos.ecosyste.ms/api/v1/usage/pypi/testing-datatrove/dependencies","status":null,"funding_links":[],"critical":null,"issue_metadata":{"last_synced_at":"2025-08-29T02:19:13.612Z","issues_count":86,"pull_requests_count":131,"avg_time_to_close_issue":3895737.906976744,"avg_time_to_close_pull_request":1246380.8661417323,"issues_closed_count":43,"pull_requests_closed_count":127,"pull_request_authors_count":38,"issue_authors_count":48,"avg_comments_per_issue":2.5,"avg_comments_per_pull_request":0.8854961832061069,"merged_pull_requests_count":114,"bot_issues_count":0,"bot_pull_requests_count":2,"past_year_issues_count":58,"past_year_pull_requests_count":46,"past_year_avg_time_to_close_issue":3863882.619047619,"past_year_avg_time_to_close_pull_request":2938278.976190476,"past_year_issues_closed_count":21,"past_year_pull_requests_closed_count":42,"past_year_pull_request_authors_count":25,"past_year_issue_authors_count":34,"past_year_avg_comments_per_issue":1.8448275862068966,"past_year_avg_comments_per_pull_request":1.065217391304348,"past_year_bot_issues_count":0,"past_year_bot_pull_requests_count":2,"past_year_merged_pull_requests_count":36,"issues_url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove/issues","maintainers":[{"login":"guipenedo","count":42,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/guipenedo"},{"login":"thomwolf","count":8,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/thomwolf"},{"login":"anton-l","count":2,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/anton-l"},{"login":"garrethlee","count":1,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/garrethlee"},{"login":"sayakpaul","count":1,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/sayakpaul"}],"active_maintainers":[{"login":"guipenedo","count":6,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/guipenedo"},{"login":"garrethlee","count":1,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/garrethlee"}]},"versions_url":"https://packages.ecosyste.ms/api/v1/registries/pypi.org/packages/testing-datatrove/versions","version_numbers_url":"https://packages.ecosyste.ms/api/v1/registries/pypi.org/packages/testing-datatrove/version_numbers","dependent_packages_url":"https://packages.ecosyste.ms/api/v1/registries/pypi.org/packages/testing-datatrove/dependent_packages","related_packages_url":"https://packages.ecosyste.ms/api/v1/registries/pypi.org/packages/testing-datatrove/related_packages","maintainers":[{"uuid":"hynky","login":"hynky","name":null,"email":null,"url":null,"packages_count":8,"html_url":"https://pypi.org/user/hynky/","role":null,"created_at":"2024-04-22T14:31:28.269Z","updated_at":"2024-04-22T14:31:28.269Z","packages_url":"https://packages.ecosyste.ms/api/v1/registries/pypi.org/maintainers/hynky/packages"}],"registry":{"name":"pypi.org","url":"https://pypi.org","ecosystem":"pypi","default":true,"packages_count":725151,"maintainers_count":308368,"namespaces_count":0,"keywords_count":238006,"github":"pypi","metadata":{"funded_packages_count":50519},"icon_url":"https://github.com/pypi.png","created_at":"2022-04-04T15:19:23.364Z","updated_at":"2025-09-05T05:40:58.860Z","packages_url":"https://packages.ecosyste.ms/api/v1/registries/pypi.org/packages","maintainers_url":"https://packages.ecosyste.ms/api/v1/registries/pypi.org/maintainers","namespaces_url":"https://packages.ecosyste.ms/api/v1/registries/pypi.org/namespaces"}},{"id":8623963,"name":"datatrove","ecosystem":"pypi","description":"HuggingFace library to process and filter large amounts of webdata","homepage":null,"licenses":"Apache-2.0","normalized_licenses":["Apache-2.0"],"repository_url":"https://github.com/huggingface/datatrove","keywords_array":["data","machine","learning","processing"],"namespace":null,"versions_count":7,"first_release_published_at":"2023-12-06T12:11:12.000Z","latest_release_published_at":"2025-08-07T19:01:10.000Z","latest_release_number":"0.6.0","last_synced_at":"2025-08-29T01:58:57.485Z","created_at":"2023-12-06T12:16:37.297Z","updated_at":"2025-08-29T02:19:14.415Z","registry_url":"https://pypi.org/project/datatrove/","install_command":"pip install datatrove --index-url https://pypi.org/simple","documentation_url":"https://datatrove.readthedocs.io/","metadata":{"funding":null,"documentation":null,"classifiers":["Intended Audience :: Developers","Intended Audience :: Education","Intended Audience :: Science/Research","License :: OSI Approved :: Apache Software License","Operating System :: OS Independent","Programming Language :: Python :: 3","Programming Language :: Python :: 3.10","Programming Language :: Python :: 3.11","Programming Language :: Python :: 3.12","Topic :: Scientific/Engineering :: Artificial Intelligence"],"normalized_name":"datatrove","project_status":null},"repo_metadata":{"id":218113068,"uuid":"653623369","full_name":"huggingface/datatrove","owner":"huggingface","description":"Freeing data processing from scripting madness by providing a set of platform-agnostic customizable pipeline processing blocks.","archived":false,"fork":false,"pushed_at":"2025-08-26T13:06:41.000Z","size":34077,"stargazers_count":2562,"open_issues_count":83,"forks_count":205,"subscribers_count":47,"default_branch":"main","last_synced_at":"2025-08-28T22:18:24.471Z","etag":null,"topics":[],"latest_commit_sha":null,"homepage":"","language":"Python","has_issues":true,"has_wiki":null,"has_pages":null,"mirror_url":null,"source_name":null,"license":"apache-2.0","status":null,"scm":"git","pull_requests_enabled":true,"icon_url":"https://github.com/huggingface.png","metadata":{"files":{"readme":"README.md","changelog":null,"contributing":null,"funding":null,"license":"LICENSE","code_of_conduct":null,"threat_model":null,"audit":null,"citation":"CITATION.cff","codeowners":null,"security":null,"support":null,"governance":null,"roadmap":null,"authors":null,"dei":null,"publiccode":null,"codemeta":null,"zenodo":null}},"created_at":"2023-06-14T12:05:28.000Z","updated_at":"2025-08-28T15:15:09.000Z","dependencies_parsed_at":"2024-02-17T16:30:40.456Z","dependency_job_id":"5a0f2b08-0b74-4244-a8bd-09e9fc40306b","html_url":"https://github.com/huggingface/datatrove","commit_stats":{"total_commits":419,"total_committers":38,"mean_commits":"11.026315789473685","dds":0.3317422434367542,"last_synced_commit":"371c014374dd1805e42fd58a8a91dcb1309abb2f"},"previous_names":["huggingface/datatrove"],"tags_count":5,"template":false,"template_full_name":null,"purl":"pkg:github/huggingface/datatrove","repository_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove","tags_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove/tags","releases_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove/releases","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove/manifests","owner_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/huggingface","download_url":"https://codeload.github.com/huggingface/datatrove/tar.gz/refs/heads/main","sbom_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove/sbom","scorecard":null,"host":{"name":"GitHub","url":"https://github.com","kind":"github","repositories_count":272610807,"owners_count":24964354,"icon_url":"https://github.com/github.png","version":null,"created_at":"2022-05-30T11:31:42.601Z","updated_at":"2022-07-04T15:15:14.044Z","status":"online","status_checked_at":"2025-08-29T02:00:10.610Z","response_time":87,"last_error":null,"robots_txt_status":"success","robots_txt_updated_at":"2025-07-24T06:49:26.215Z","robots_txt_url":"https://github.com/robots.txt","online":true,"can_crawl_api":true,"host_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub","repositories_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories","repository_names_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repository_names","owners_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners"},"owner_record":{"login":"huggingface","name":"Hugging Face","uuid":"25720743","kind":"organization","description":"The AI community building the future.","email":null,"website":"https://huggingface.co/","location":"NYC + Paris","twitter":"huggingface","company":null,"icon_url":"https://avatars.githubusercontent.com/u/25720743?v=4","repositories_count":344,"last_synced_at":"2025-08-28T17:09:23.402Z","metadata":{"has_sponsors_listing":false},"html_url":"https://github.com/huggingface","funding_links":[],"total_stars":581025,"followers":53217,"following":0,"created_at":"2022-11-02T16:28:23.192Z","updated_at":"2025-08-28T17:09:23.402Z","owner_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/huggingface","repositories_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/huggingface/repositories"},"tags":[{"name":"v0.5.0","sha":"99206aaf6ea86ba39d05a831d532665ea612d686","kind":"commit","published_at":"2025-04-30T16:43:03.000Z","download_url":"https://codeload.github.com/huggingface/datatrove/tar.gz/v0.5.0","html_url":"https://github.com/huggingface/datatrove/releases/tag/v0.5.0","dependencies_parsed_at":null,"dependency_job_id":null,"purl":"pkg:github/huggingface/datatrove@v0.5.0","tag_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove/tags/v0.5.0","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove/tags/v0.5.0/manifests"},{"name":"v0.4.0","sha":"842b241c23bbd2aaa5c102a28a26b3c3a98589bb","kind":"commit","published_at":"2024-12-06T18:23:33.000Z","download_url":"https://codeload.github.com/huggingface/datatrove/tar.gz/v0.4.0","html_url":"https://github.com/huggingface/datatrove/releases/tag/v0.4.0","dependencies_parsed_at":null,"dependency_job_id":null,"purl":"pkg:github/huggingface/datatrove@v0.4.0","tag_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove/tags/v0.4.0","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove/tags/v0.4.0/manifests"},{"name":"v0.3.0","sha":"d95e0ee85d3ce3a376c46dfdbf22b0f23749b654","kind":"commit","published_at":"2024-08-28T15:36:23.000Z","download_url":"https://codeload.github.com/huggingface/datatrove/tar.gz/v0.3.0","html_url":"https://github.com/huggingface/datatrove/releases/tag/v0.3.0","dependencies_parsed_at":null,"dependency_job_id":null,"purl":"pkg:github/huggingface/datatrove@v0.3.0","tag_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove/tags/v0.3.0","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove/tags/v0.3.0/manifests"},{"name":"v0.2.0","sha":"6d06210c337b6b54dfc48bce44ac32316da84f86","kind":"commit","published_at":"2024-04-22T16:53:45.000Z","download_url":"https://codeload.github.com/huggingface/datatrove/tar.gz/v0.2.0","html_url":"https://github.com/huggingface/datatrove/releases/tag/v0.2.0","dependencies_parsed_at":null,"dependency_job_id":null,"purl":"pkg:github/huggingface/datatrove@v0.2.0","tag_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove/tags/v0.2.0","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove/tags/v0.2.0/manifests"},{"name":"v0.0.1","sha":"bd3c89a2cf65320d42593eb4ab7975cbea878143","kind":"tag","published_at":"2024-02-07T09:58:06.000Z","download_url":"https://codeload.github.com/huggingface/datatrove/tar.gz/v0.0.1","html_url":"https://github.com/huggingface/datatrove/releases/tag/v0.0.1","dependencies_parsed_at":null,"dependency_job_id":null,"purl":"pkg:github/huggingface/datatrove@v0.0.1","tag_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove/tags/v0.0.1","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove/tags/v0.0.1/manifests"}]},"repo_metadata_updated_at":"2025-08-29T02:19:14.415Z","dependent_packages_count":0,"downloads":16380,"downloads_period":"last-month","dependent_repos_count":0,"rankings":{"downloads":null,"dependent_repos_count":67.23217911346916,"dependent_packages_count":10.104988627979555,"stargazers_count":null,"forks_count":null,"docker_downloads_count":null,"average":38.668583870724355},"purl":"pkg:pypi/datatrove","advisories":[],"docker_usage_url":"https://docker.ecosyste.ms/usage/pypi/datatrove","docker_dependents_count":null,"docker_downloads_count":null,"usage_url":"https://repos.ecosyste.ms/usage/pypi/datatrove","dependent_repositories_url":"https://repos.ecosyste.ms/api/v1/usage/pypi/datatrove/dependencies","status":null,"funding_links":[],"critical":null,"issue_metadata":{"last_synced_at":"2025-08-29T02:19:13.612Z","issues_count":86,"pull_requests_count":131,"avg_time_to_close_issue":3895737.906976744,"avg_time_to_close_pull_request":1246380.8661417323,"issues_closed_count":43,"pull_requests_closed_count":127,"pull_request_authors_count":38,"issue_authors_count":48,"avg_comments_per_issue":2.5,"avg_comments_per_pull_request":0.8854961832061069,"merged_pull_requests_count":114,"bot_issues_count":0,"bot_pull_requests_count":2,"past_year_issues_count":58,"past_year_pull_requests_count":46,"past_year_avg_time_to_close_issue":3863882.619047619,"past_year_avg_time_to_close_pull_request":2938278.976190476,"past_year_issues_closed_count":21,"past_year_pull_requests_closed_count":42,"past_year_pull_request_authors_count":25,"past_year_issue_authors_count":34,"past_year_avg_comments_per_issue":1.8448275862068966,"past_year_avg_comments_per_pull_request":1.065217391304348,"past_year_bot_issues_count":0,"past_year_bot_pull_requests_count":2,"past_year_merged_pull_requests_count":36,"issues_url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove/issues","maintainers":[{"login":"guipenedo","count":42,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/guipenedo"},{"login":"thomwolf","count":8,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/thomwolf"},{"login":"anton-l","count":2,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/anton-l"},{"login":"garrethlee","count":1,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/garrethlee"},{"login":"sayakpaul","count":1,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/sayakpaul"}],"active_maintainers":[{"login":"guipenedo","count":6,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/guipenedo"},{"login":"garrethlee","count":1,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/garrethlee"}]},"versions_url":"https://packages.ecosyste.ms/api/v1/registries/pypi.org/packages/datatrove/versions","version_numbers_url":"https://packages.ecosyste.ms/api/v1/registries/pypi.org/packages/datatrove/version_numbers","dependent_packages_url":"https://packages.ecosyste.ms/api/v1/registries/pypi.org/packages/datatrove/dependent_packages","related_packages_url":"https://packages.ecosyste.ms/api/v1/registries/pypi.org/packages/datatrove/related_packages","maintainers":[{"uuid":"Thomwolf","login":"Thomwolf","name":null,"email":null,"url":null,"packages_count":16,"html_url":"https://pypi.org/user/Thomwolf/","role":null,"created_at":"2023-12-06T12:52:19.304Z","updated_at":"2023-12-06T12:52:19.304Z","packages_url":"https://packages.ecosyste.ms/api/v1/registries/pypi.org/maintainers/Thomwolf/packages"},{"uuid":"guipenedo","login":"guipenedo","name":null,"email":null,"url":null,"packages_count":3,"html_url":"https://pypi.org/user/guipenedo/","role":null,"created_at":"2023-12-06T12:52:19.148Z","updated_at":"2023-12-06T12:52:19.148Z","packages_url":"https://packages.ecosyste.ms/api/v1/registries/pypi.org/maintainers/guipenedo/packages"},{"uuid":"hynky","login":"hynky","name":null,"email":null,"url":null,"packages_count":8,"html_url":"https://pypi.org/user/hynky/","role":null,"created_at":"2024-05-23T19:22:52.844Z","updated_at":"2024-05-23T19:22:52.844Z","packages_url":"https://packages.ecosyste.ms/api/v1/registries/pypi.org/maintainers/hynky/packages"}],"registry":{"name":"pypi.org","url":"https://pypi.org","ecosystem":"pypi","default":true,"packages_count":725151,"maintainers_count":308368,"namespaces_count":0,"keywords_count":238006,"github":"pypi","metadata":{"funded_packages_count":50519},"icon_url":"https://github.com/pypi.png","created_at":"2022-04-04T15:19:23.364Z","updated_at":"2025-09-05T05:40:58.860Z","packages_url":"https://packages.ecosyste.ms/api/v1/registries/pypi.org/packages","maintainers_url":"https://packages.ecosyste.ms/api/v1/registries/pypi.org/maintainers","namespaces_url":"https://packages.ecosyste.ms/api/v1/registries/pypi.org/namespaces"}}],"commits":{"id":1773501,"full_name":"huggingface/datatrove","default_branch":"main","committers":[{"name":"guipenedo","email":"nostrumg@gmail.com","login":"guipenedo","count":301},{"name":"alessandro cappelli","email":"alessandro@lighton.ai","login":"alexchapeaux","count":40},{"name":"Hynek Kydlíček","email":"kydlicek.hynek@gmail.com","login":"hynky1999","count":18},{"name":"Thomas Wolf","email":"thomas@huggingface.co","login":"thomwolf","count":14},{"name":"sungjun lee","email":"jun.untitled@kakaobrain.com","login":"justHungryMan","count":13},{"name":"Mario Šaško","email":"mariosasko777@gmail.com","login":"mariosasko","count":9},{"name":"anton-","email":"anton@huggingface.co","login":"anton-l","count":8},{"name":"Zehan Li","email":"69186130+jordane95","login":"jordane95","count":7},{"name":"Bram Vanroy","email":"2779410+BramVanroy","login":"BramVanroy","count":3},{"name":"Jay Lee","email":"sk8terbo2@gmail.com","login":"aiqwe","count":3},{"name":"shizhediao","email":"sdiaoaa@connect.ust.hk","login":"shizhediao","count":2},{"name":"its5Q","email":"leshaegoroov@gmail.com","login":"its5Q","count":2},{"name":"dependabot[bot]","email":"49699333+dependabot[bot]","login":"dependabot[bot]","count":2},{"name":"Tyler Thomas","email":"36181311+tylerjthomas9","login":"tylerjthomas9","count":2},{"name":"Qasid Saleem","email":"91096848+QasidSaleem","login":"QasidSaleem","count":2},{"name":"Nelson Liu","email":"nelson-liu","login":"nelson-liu","count":2},{"name":"Marianna","email":"43296932+marianna13","login":"marianna13","count":2},{"name":"Luc Georges","email":"McPatate","login":"McPatate","count":2},{"name":"Colin Raffel","email":"craffel@gmail.com","login":"craffel","count":2},{"name":"Antoni-Joan Solergibert","email":"74564958+TJ-Solergibert","login":"TJ-Solergibert","count":2},{"name":"0xh3x","email":"giorgi.jvaridze@gmail.com","login":"0xh3x","count":1},{"name":"vsabolcec","email":"60775189+vsabolcec","login":"vsabolcec","count":1},{"name":"sippycoder","email":"134823555+sippycoder","login":"sippycoder","count":1},{"name":"muzzynine","email":"muzzynine@gmail.com","login":"muzzynine","count":1},{"name":"kylematoba","email":"22180455+kylematoba","login":"kylematoba","count":1},{"name":"fierzdev","email":"20905106+fierzdev","login":"fierzdev","count":1},{"name":"beme248","email":"beme248","login":"beme248","count":1},{"name":"baggiponte","email":"57922983+baggiponte","login":"baggiponte","count":1},{"name":"Vivien Cabannes","email":"vivien.cabannes@gmail.com","login":"VivienCabannes","count":1},{"name":"Stephen Rebel","email":"75142837+StephenRebel","login":"StephenRebel","count":1},{"name":"Silver","email":"zhengyinhe1@163.com","login":"silverriver","count":1},{"name":"Sam Foreman","email":"saforem2@gmail.com","login":"saforem2","count":1},{"name":"Ran Tavory","email":"rantav@gmail.com","login":"rantav","count":1},{"name":"Perry Li","email":"462046122@qq.com","login":"Youggls","count":1},{"name":"Olga","email":"72709900+olga1988olga","login":"olga1988olga","count":1},{"name":"Nicholas L","email":"66135546+NicholasLindner","login":"NicholasLindner","count":1},{"name":"M. Tolga Cangöz","email":"46008593+standardAI","login":"standardAI","count":1},{"name":"Loubna Ben Allal","email":"44069155+loubnabnl","login":"loubnabnl","count":1},{"name":"Leandro von Werra","email":"lvwerra","login":"lvwerra","count":1},{"name":"LFu","email":"lyuwen","login":"lyuwen","count":1},{"name":"Jafar Isbarov","email":"60838378+ceferisbarov","login":"ceferisbarov","count":1},{"name":"Hüseyin ABANOZ","email":"huseyinabanozis@gmail.com","login":"habanoz","count":1},{"name":"Giorgio Angelotti","email":"76100950+giorgioangel","login":"giorgioangel","count":1},{"name":"Anacheron51","email":"49575933+Anacheron51","login":"Anacheron51","count":1},{"name":"Adrien Barbaresi","email":"adbar","login":"adbar","count":1}],"total_commits":461,"total_committers":45,"total_bot_commits":2,"total_bot_committers":1,"mean_commits":10.244444444444444,"dds":0.34707158351409984,"past_year_committers":[{"name":"guipenedo","email":"nostrumg@gmail.com","login":"guipenedo","count":55},{"name":"Hynek Kydlíček","email":"kydlicek.hynek@gmail.com","login":"hynky1999","count":12},{"name":"sungjun lee","email":"jun.untitled@kakaobrain.com","login":"justHungryMan","count":10},{"name":"Bram Vanroy","email":"2779410+BramVanroy","login":"BramVanroy","count":3},{"name":"Jay Lee","email":"sk8terbo2@gmail.com","login":"aiqwe","count":3},{"name":"shizhediao","email":"sdiaoaa@connect.ust.hk","login":"shizhediao","count":2},{"name":"its5Q","email":"leshaegoroov@gmail.com","login":"its5Q","count":2},{"name":"dependabot[bot]","email":"49699333+dependabot[bot]","login":"dependabot[bot]","count":2},{"name":"Zehan Li","email":"69186130+jordane95","login":"jordane95","count":2},{"name":"Tyler Thomas","email":"36181311+tylerjthomas9","login":"tylerjthomas9","count":2},{"name":"Nelson Liu","email":"nelson-liu","login":"nelson-liu","count":2},{"name":"Luc Georges","email":"McPatate","login":"McPatate","count":2},{"name":"Colin Raffel","email":"craffel@gmail.com","login":"craffel","count":2},{"name":"Antoni-Joan Solergibert","email":"74564958+TJ-Solergibert","login":"TJ-Solergibert","count":2},{"name":"Hüseyin ABANOZ","email":"huseyinabanozis@gmail.com","login":"habanoz","count":1},{"name":"Jafar Isbarov","email":"60838378+ceferisbarov","login":"ceferisbarov","count":1},{"name":"LFu","email":"lyuwen","login":"lyuwen","count":1},{"name":"Loubna Ben Allal","email":"44069155+loubnabnl","login":"loubnabnl","count":1},{"name":"Olga","email":"72709900+olga1988olga","login":"olga1988olga","count":1},{"name":"Perry Li","email":"462046122@qq.com","login":"Youggls","count":1},{"name":"Qasid Saleem","email":"91096848+QasidSaleem","login":"QasidSaleem","count":1},{"name":"Sam Foreman","email":"saforem2@gmail.com","login":"saforem2","count":1},{"name":"Silver","email":"zhengyinhe1@163.com","login":"silverriver","count":1},{"name":"Stephen Rebel","email":"75142837+StephenRebel","login":"StephenRebel","count":1},{"name":"Thomas Wolf","email":"thomwolf","login":"thomwolf","count":1},{"name":"Vivien Cabannes","email":"vivien.cabannes@gmail.com","login":"VivienCabannes","count":1},{"name":"beme248","email":"beme248","login":"beme248","count":1},{"name":"kylematoba","email":"22180455+kylematoba","login":"kylematoba","count":1},{"name":"muzzynine","email":"muzzynine@gmail.com","login":"muzzynine","count":1},{"name":"sippycoder","email":"134823555+sippycoder","login":"sippycoder","count":1},{"name":"vsabolcec","email":"60775189+vsabolcec","login":"vsabolcec","count":1}],"past_year_total_commits":118,"past_year_total_committers":31,"past_year_total_bot_commits":2,"past_year_total_bot_committers":1,"past_year_mean_commits":3.806451612903226,"past_year_dds":0.5338983050847458,"last_synced_at":"2025-05-14T06:25:02.148Z","last_synced_commit":"99206aaf6ea86ba39d05a831d532665ea612d686","created_at":"2024-09-20T16:07:46.149Z","updated_at":"2025-05-14T06:25:02.185Z","commits_url":"https://commits.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove/commits","host":{"name":"GitHub","url":"https://github.com","kind":"github","last_synced_at":"2025-09-05T00:00:10.343Z","repositories_count":5480019,"commits_count":853389012,"contributors_count":31098138,"owners_count":906558,"icon_url":"https://github.com/github.png","host_url":"https://commits.ecosyste.ms/api/v1/hosts/GitHub","repositories_url":"https://commits.ecosyste.ms/api/v1/hosts/GitHub/repositories"}},"issues_stats":{"full_name":"huggingface/datatrove","html_url":"https://github.com/huggingface/datatrove","last_synced_at":"2025-08-31T07:46:37.640Z","status":null,"issues_count":88,"pull_requests_count":160,"avg_time_to_close_issue":4344185.780487805,"avg_time_to_close_pull_request":1451978.4276315789,"issues_closed_count":41,"pull_requests_closed_count":152,"pull_request_authors_count":41,"issue_authors_count":51,"avg_comments_per_issue":2.1477272727272725,"avg_comments_per_pull_request":0.8375,"merged_pull_requests_count":137,"bot_issues_count":0,"bot_pull_requests_count":4,"past_year_issues_count":53,"past_year_pull_requests_count":69,"past_year_avg_time_to_close_issue":3295004.8571428573,"past_year_avg_time_to_close_pull_request":2140793.7377049183,"past_year_issues_closed_count":14,"past_year_pull_requests_closed_count":61,"past_year_pull_request_authors_count":28,"past_year_issue_authors_count":34,"past_year_avg_comments_per_issue":0.9811320754716981,"past_year_avg_comments_per_pull_request":0.9130434782608695,"past_year_bot_issues_count":0,"past_year_bot_pull_requests_count":4,"past_year_merged_pull_requests_count":56,"created_at":"2024-09-20T16:07:47.180Z","updated_at":"2025-09-03T06:47:56.913Z","repository_url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove","issues_url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/repositories/huggingface%2Fdatatrove/issues","issue_labels_count":{"enhancement":5,"question":4,"bug":3},"pull_request_labels_count":{"dependencies":4,"rust":4},"issue_author_associations_count":{"NONE":107,"CONTRIBUTOR":51,"COLLABORATOR":1},"pull_request_author_associations_count":{"CONTRIBUTOR":116,"COLLABORATOR":81,"NONE":57,"MEMBER":15},"issue_authors":{"jordane95":31,"justHungryMan":8,"shizhediao":6,"stas00":6,"Manel-Hik":5,"rantav":4,"hynky1999":4,"nelson-liu":3,"BramVanroy":3,"jgcb00":3,"yongkangzhao":3,"canghaiyunfan":2,"aiqwe":2,"LeMoussel":2,"WenhaoZhang-Git":2,"eurethia":2,"simplew2011":2,"griff4692":2,"solene-evain":2,"axelmagn":2,"ShayDuane":2,"yjha9649":1,"nldxtd":1,"williamlin0518":1,"BakingBrains":1,"mcleish7":1,"Jeronymous":1,"SabFol":1,"marianna13":1,"aditya-hari":1,"frogeyedpeas":1,"jquesnelle":1,"ayushdg":1,"Anacheron51":1,"basma-b":1,"StephenRebelSSC":1,"H-Plus-Time":1,"marcopasqua":1,"srinjoym-cerebras":1,"hadim":1,"LymphV":1,"ryan-minato":1,"barneylogo":1,"amangup":1,"baon6052":1,"adbar":1,"parkwonjae":1,"c21":1,"hour":1,"silverriver":1,"prozzzzzz":1,"hnipun":1,"manuelbrack":1,"fakerybakery":1,"StephenRebel":1,"baggiponte":1,"UniverseFly":1,"nrv":1,"barney49":1,"guipenedo":1,"habanoz":1,"klarakaleb":1,"crisgarrillo":1,"ftgreat":1,"eltonjohnfanboy":1,"mrunesson":1,"dittops":1,"loganhart02":1,"staticpunch":1,"cryptowooser":1,"nicofirst1":1,"Maghoumi":1,"sippycoder":1,"akshayg08":1,"prasannapattam":1,"0xh3x":1,"hiennm15":1,"seralf":1,"its5Q":1,"elifssamplespace":1,"ql1235":1,"lfoppiano":1,"theyorubayesian":1,"mohataher":1},"pull_request_authors":{"guipenedo":79,"hynky1999":16,"jordane95":15,"alexchapeaux":15,"justHungryMan":13,"mariosasko":10,"thomwolf":10,"aiqwe":7,"BramVanroy":7,"fakerybakery":4,"nelson-liu":4,"dependabot[bot]":4,"silverriver":4,"marianna13":3,"TJ-Solergibert":3,"craffel":3,"VivienCabannes":2,"shizhediao":2,"EmanuelaBoros":2,"muzzynine":2,"ceferisbarov":2,"its5Q":2,"kylematoba":2,"NicholasLindner":2,"Tavish9":2,"frogeyedpeas":2,"anton-l":2,"Youggls":2,"0xh3x":2,"LeMoussel":2,"baggiponte":2,"olga1988olga":2,"WissamAntoun":2,"fierzdev":2,"habanoz":2,"xufeisofly":2,"garrethlee":2,"QasidSaleem":2,"saforem2":2,"loubnabnl":2,"McPatate":2,"Anacheron51":2,"adbar":1,"lvwerra":1,"StephenRebel":1,"manuelbrack":1,"tylerjthomas9":1,"davidbrandfonbrener":1,"sippycoder":1,"koalazf99":1,"jquesnelle":1,"standardAI":1,"rantav":1,"Olexandr88":1,"dipampaul17":1,"tolgacangoz":1,"lyuwen":1,"omahs":1,"giorgioangel":1,"vsabolcec":1,"stas00":1,"beme248":1},"host":{"name":"GitHub","url":"https://github.com","kind":"github","last_synced_at":"2025-09-05T00:00:10.444Z","repositories_count":10100117,"issues_count":31473266,"pull_requests_count":97445500,"authors_count":10702683,"icon_url":"https://github.com/github.png","host_url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub","repositories_url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/repositories","owners_url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/owners","authors_url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors"},"past_year_issue_labels_count":{},"past_year_pull_request_labels_count":{"dependencies":4,"rust":4},"past_year_issue_author_associations_count":{"NONE":37,"CONTRIBUTOR":17},"past_year_pull_request_author_associations_count":{"CONTRIBUTOR":37,"NONE":24,"COLLABORATOR":14},"past_year_issue_authors":{"jordane95":9,"yongkangzhao":3,"shizhediao":3,"jgcb00":3,"ShayDuane":2,"nelson-liu":2,"LeMoussel":2,"justHungryMan":2,"BramVanroy":2,"aditya-hari":1,"yjha9649":1,"theyorubayesian":1,"srinjoym-cerebras":1,"silverriver":1,"amangup":1,"BakingBrains":1,"seralf":1,"SabFol":1,"ql1235":1,"prozzzzzz":1,"prasannapattam":1,"parkwonjae":1,"baon6052":1,"mcleish7":1,"Maghoumi":1,"LymphV":1,"barneylogo":1,"klarakaleb":1,"jquesnelle":1,"crisgarrillo":1,"frogeyedpeas":1,"hour":1,"hadim":1,"habanoz":1,"ftgreat":1},"past_year_pull_request_authors":{"guipenedo":12,"BramVanroy":5,"dependabot[bot]":4,"jordane95":4,"silverriver":4,"nelson-liu":4,"hynky1999":4,"craffel":3,"kylematoba":2,"aiqwe":2,"TJ-Solergibert":2,"frogeyedpeas":2,"Youggls":2,"saforem2":2,"LeMoussel":2,"xufeisofly":2,"WissamAntoun":2,"VivienCabannes":2,"habanoz":2,"ceferisbarov":2,"muzzynine":2,"garrethlee":2,"Tavish9":2,"dipampaul17":1,"Olexandr88":1,"jquesnelle":1,"lyuwen":1,"omahs":1},"maintainers":[{"login":"guipenedo","count":80,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/guipenedo"},{"login":"thomwolf","count":10,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/thomwolf"},{"login":"McPatate","count":2,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/McPatate"},{"login":"garrethlee","count":2,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/garrethlee"},{"login":"anton-l","count":2,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/anton-l"},{"login":"lvwerra","count":1,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/lvwerra"}],"active_maintainers":[{"login":"guipenedo","count":12,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/guipenedo"},{"login":"garrethlee","count":2,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/garrethlee"}]},"events":{"total":{"CreateEvent":17,"ReleaseEvent":2,"IssuesEvent":54,"WatchEvent":497,"DeleteEvent":24,"IssueCommentEvent":124,"PushEvent":172,"PullRequestReviewEvent":14,"PullRequestReviewCommentEvent":12,"PullRequestEvent":68,"ForkEvent":63},"last_year":{"CreateEvent":17,"ReleaseEvent":2,"IssuesEvent":54,"WatchEvent":497,"DeleteEvent":24,"IssueCommentEvent":124,"PushEvent":172,"PullRequestReviewEvent":14,"PullRequestReviewCommentEvent":12,"PullRequestEvent":68,"ForkEvent":63}},"keywords":[],"dependencies":[{"ecosystem":"actions","filepath":".github/workflows/ci.yml","sha":null,"kind":"manifest","created_at":"2024-01-19T20:34:41.320Z","updated_at":"2024-01-19T20:34:41.320Z","repository_link":"https://github.com/huggingface/datatrove/blob/main/.github/workflows/ci.yml","dependencies":[{"id":15710404386,"package_name":"actions/checkout","ecosystem":"actions","requirements":"v3","direct":true,"kind":"composite","optional":false},{"id":15710404389,"package_name":"actions/setup-python","ecosystem":"actions","requirements":"v4","direct":true,"kind":"composite","optional":false}]},{"ecosystem":"pypi","filepath":"pyproject.toml","sha":null,"kind":"manifest","created_at":"2024-01-19T20:34:41.533Z","updated_at":"2024-01-19T20:34:41.533Z","repository_link":"https://github.com/huggingface/datatrove/blob/main/pyproject.toml","dependencies":[]}],"score":21.391088324882983,"created_at":"2025-09-04T15:51:30.944Z","updated_at":"2025-10-07T08:21:47.517Z","avatar_url":"https://github.com/huggingface.png","language":"Python","category":null,"sub_category":null,"monthly_downloads":16380,"funding_links":[],"readme_doi_urls":[],"works":{},"citation_counts":{},"total_citations":0,"keywords_from_contributors":["transformer","jax","cryptocurrency","cryptography","language-model","distributed","interactive","optimizing-compiler","interpretability","optim"],"project_url":"https://science.ecosyste.ms/api/v1/projects/55174","html_url":"https://science.ecosyste.ms/projects/55174","bibtex_url":"https://science.ecosyste.ms/projects/55174/export.bibtex","apalike_url":"https://science.ecosyste.ms/projects/55174/export.apalike"}