{"id":5089,"name":"curator","description":"Scalable data pre processing and curation toolkit for LLMs","url":"https://github.com/nvidia-nemo/curator","last_synced_at":"2025-09-04T18:27:56.778Z","repository":{"id":227968333,"uuid":"772255271","full_name":"NVIDIA-NeMo/Curator","owner":"NVIDIA-NeMo","description":"Scalable data pre processing and curation toolkit for LLMs","archived":false,"fork":false,"pushed_at":"2025-08-29T06:52:28.000Z","size":16832,"stargazers_count":1111,"open_issues_count":95,"forks_count":166,"subscribers_count":17,"default_branch":"main","last_synced_at":"2025-08-29T08:28:29.386Z","etag":null,"topics":["data","data-curation","data-prep","data-preparation","data-processing","data-processing-pipelines","data-quality","datacuration","datarecipes","deduplication","fast-data-processing","fine-tuning","large-language-models","large-scale-data-processing","llm","llm-data-quality","llmapps","python","semantic-deduplication"],"latest_commit_sha":null,"homepage":"","language":"Python","has_issues":true,"has_wiki":null,"has_pages":null,"mirror_url":null,"source_name":null,"license":"apache-2.0","status":null,"scm":"git","pull_requests_enabled":true,"icon_url":"https://github.com/NVIDIA-NeMo.png","metadata":{"files":{"readme":"README.md","changelog":"CHANGELOG.md","contributing":"CONTRIBUTING.md","funding":null,"license":"LICENSE","code_of_conduct":null,"threat_model":null,"audit":null,"citation":"CITATION.cff","codeowners":null,"security":"SECURITY.md","support":null,"governance":null,"roadmap":null,"authors":null,"dei":null,"publiccode":null,"codemeta":null,"zenodo":null}},"created_at":"2024-03-14T20:41:51.000Z","updated_at":"2025-08-29T06:06:46.000Z","dependencies_parsed_at":"2024-05-21T23:28:40.873Z","dependency_job_id":"38698fa3-e369-413e-a338-faecd83a5d64","html_url":"https://github.com/NVIDIA-NeMo/Curator","commit_stats":null,"previous_names":["nvidia/nemo-curator","nvidia-nemo/curator"],"tags_count":19,"template":false,"template_full_name":null,"purl":"pkg:github/NVIDIA-NeMo/Curator","repository_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/NVIDIA-NeMo%2FCurator","tags_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/NVIDIA-NeMo%2FCurator/tags","releases_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/NVIDIA-NeMo%2FCurator/releases","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/NVIDIA-NeMo%2FCurator/manifests","owner_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/NVIDIA-NeMo","download_url":"https://codeload.github.com/NVIDIA-NeMo/Curator/tar.gz/refs/heads/main","sbom_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/NVIDIA-NeMo%2FCurator/sbom","scorecard":null,"host":{"name":"GitHub","url":"https://github.com","kind":"github","repositories_count":273651965,"owners_count":25144148,"icon_url":"https://github.com/github.png","version":null,"created_at":"2022-05-30T11:31:42.601Z","updated_at":"2022-07-04T15:15:14.044Z","status":"online","status_checked_at":"2025-09-04T02:00:08.968Z","response_time":61,"last_error":null,"robots_txt_status":"success","robots_txt_updated_at":"2025-07-24T06:49:26.215Z","robots_txt_url":"https://github.com/robots.txt","online":true,"can_crawl_api":true,"host_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub","repositories_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories","repository_names_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repository_names","owners_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners"}},"owner":{"login":"NVIDIA-NeMo","name":"NVIDIA-NeMo","uuid":"213689629","kind":"organization","description":"","email":null,"website":null,"location":null,"twitter":null,"company":null,"icon_url":"https://avatars.githubusercontent.com/u/213689629?v=4","repositories_count":1,"last_synced_at":"2025-06-10T02:31:13.250Z","metadata":{"has_sponsors_listing":false},"html_url":"https://github.com/NVIDIA-NeMo","funding_links":[],"total_stars":3,"followers":13,"following":0,"created_at":"2025-06-10T02:31:13.275Z","updated_at":"2025-06-10T02:31:13.275Z","owner_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/NVIDIA-NeMo","repositories_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/NVIDIA-NeMo/repositories"},"packages":[],"commits":{"message":"Repository syncing started."},"issues_stats":{"full_name":"NVIDIA-NeMo/Curator","html_url":"https://github.com/NVIDIA-NeMo/Curator","last_synced_at":"2025-09-04T18:20:55.941Z","status":null,"issues_count":38,"pull_requests_count":168,"avg_time_to_close_issue":14767617.2,"avg_time_to_close_pull_request":1084369.7254901961,"issues_closed_count":10,"pull_requests_closed_count":102,"pull_request_authors_count":25,"issue_authors_count":16,"avg_comments_per_issue":0.631578947368421,"avg_comments_per_pull_request":1.375,"merged_pull_requests_count":81,"bot_issues_count":0,"bot_pull_requests_count":1,"past_year_issues_count":36,"past_year_pull_requests_count":167,"past_year_avg_time_to_close_issue":8701995.25,"past_year_avg_time_to_close_pull_request":704851.5445544554,"past_year_issues_closed_count":8,"past_year_pull_requests_closed_count":101,"past_year_pull_request_authors_count":24,"past_year_issue_authors_count":15,"past_year_avg_comments_per_issue":0.5,"past_year_avg_comments_per_pull_request":1.3652694610778442,"past_year_bot_issues_count":0,"past_year_bot_pull_requests_count":1,"past_year_merged_pull_requests_count":81,"created_at":"2025-07-16T12:20:41.883Z","updated_at":"2025-09-04T18:20:55.942Z","repository_url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/repositories/NVIDIA-NeMo%2FCurator","issues_url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/repositories/NVIDIA-NeMo%2FCurator/issues","issue_labels_count":{"enhancement":13,"bug":10,"Stale":6,"jira":5,"documentation":2,"community-request":1,"Run CICD":1,"cherry-pick":1},"pull_request_labels_count":{"Run CICD":12,"cherry-pick":12,"ray-api":11,"r0.9.0":5,"Stale":5,"community-request":4,"gpuci":4,"r1.0.0":2,"documentation":1,"dependencies":1,"python":1},"issue_author_associations_count":{"CONTRIBUTOR":30,"NONE":8},"pull_request_author_associations_count":{"CONTRIBUTOR":140,"NONE":21,"COLLABORATOR":7},"issue_authors":{"sarahyurick":9,"VibhuJawa":6,"abhinavg4":4,"praateekmahajan":4,"sithape2025":2,"ayushdg":2,"chtruong814":2,"richardliaw":1,"ronjer30":1,"CharlieTruong":1,"bschifferer":1,"suiyoubi":1,"leekaimao":1,"miguelusque":1,"QuyAnh2005":1,"yodiaditya":1},"pull_request_authors":{"praateekmahajan":29,"chtruong814":28,"suiyoubi":17,"thomasdhc":16,"sarahyurick":15,"ayushdg":13,"abhinavg4":10,"lbliii":8,"Copilot":5,"huvunvidia":4,"VibhuJawa":3,"Maghoumi":3,"TsukiSama9292":3,"arhamm1":2,"karpnv":2,"robinhad":1,"pablo-garay":1,"ruchaa-apte":1,"ko3n1g":1,"dependabot[bot]":1,"miguelusque":1,"aadesoba-nv":1,"ryantwolf":1,"ronjer30":1,"aschilling-nv":1},"host":{"name":"GitHub","url":"https://github.com","kind":"github","last_synced_at":"2025-09-04T00:00:25.939Z","repositories_count":10081478,"issues_count":31331220,"pull_requests_count":96240505,"authors_count":10693617,"icon_url":"https://github.com/github.png","host_url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub","repositories_url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/repositories","owners_url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/owners","authors_url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors"},"past_year_issue_labels_count":{"enhancement":11,"bug":10,"Stale":4,"jira":3,"documentation":2,"community-request":1,"Run CICD":1,"cherry-pick":1},"past_year_pull_request_labels_count":{"Run CICD":12,"cherry-pick":12,"ray-api":11,"r0.9.0":5,"community-request":4,"Stale":4,"gpuci":4,"r1.0.0":2,"documentation":1,"dependencies":1,"python":1},"past_year_issue_author_associations_count":{"CONTRIBUTOR":28,"NONE":8},"past_year_pull_request_author_associations_count":{"CONTRIBUTOR":139,"NONE":21,"COLLABORATOR":7},"past_year_issue_authors":{"sarahyurick":9,"VibhuJawa":6,"abhinavg4":4,"praateekmahajan":4,"chtruong814":2,"sithape2025":2,"ayushdg":1,"bschifferer":1,"CharlieTruong":1,"leekaimao":1,"QuyAnh2005":1,"richardliaw":1,"ronjer30":1,"suiyoubi":1,"yodiaditya":1},"past_year_pull_request_authors":{"praateekmahajan":29,"chtruong814":28,"suiyoubi":17,"thomasdhc":16,"sarahyurick":15,"ayushdg":13,"abhinavg4":10,"lbliii":8,"Copilot":5,"huvunvidia":4,"TsukiSama9292":3,"VibhuJawa":3,"Maghoumi":3,"arhamm1":2,"karpnv":2,"aadesoba-nv":1,"robinhad":1,"ryantwolf":1,"pablo-garay":1,"dependabot[bot]":1,"ko3n1g":1,"ruchaa-apte":1,"ronjer30":1,"aschilling-nv":1},"maintainers":[{"login":"praateekmahajan","count":4,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/praateekmahajan"},{"login":"chtruong814","count":3,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/chtruong814"}],"active_maintainers":[{"login":"praateekmahajan","count":4,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/praateekmahajan"},{"login":"chtruong814","count":3,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/chtruong814"}]},"events":null,"keywords":["data","data-curation","data-prep","data-preparation","data-processing","data-processing-pipelines","data-quality","datacuration","datarecipes","deduplication","fast-data-processing","fine-tuning","large-language-models","large-scale-data-processing","llm","llm-data-quality","llmapps","python","semantic-deduplication"],"dependencies":[{"ecosystem":"pypi","filepath":"setup.py","sha":null,"kind":"manifest","created_at":"2024-03-16T03:25:44.426Z","updated_at":"2024-03-16T03:25:44.426Z","repository_link":"https://github.com/NVIDIA-NeMo/Curator/blob/main/setup.py","dependencies":[{"id":16844828355,"package_name":"dask","ecosystem":"pypi","requirements":"*","direct":true,"kind":"runtime","optional":false}]},{"ecosystem":"actions","filepath":".github/workflows/test.yml","sha":null,"kind":"manifest","created_at":"2024-03-25T20:55:13.139Z","updated_at":"2024-03-25T20:55:13.139Z","repository_link":"https://github.com/NVIDIA-NeMo/Curator/blob/main/.github/workflows/test.yml","dependencies":[{"id":17037598039,"package_name":"actions/checkout","ecosystem":"actions","requirements":"v4","direct":true,"kind":"composite","optional":false},{"id":17037598040,"package_name":"actions/setup-python","ecosystem":"actions","requirements":"v5","direct":true,"kind":"composite","optional":false}]},{"ecosystem":"pypi","filepath":"pyproject.toml","sha":null,"kind":"manifest","created_at":"2024-03-25T20:55:13.445Z","updated_at":"2024-03-25T20:55:13.445Z","repository_link":"https://github.com/NVIDIA-NeMo/Curator/blob/main/pyproject.toml","dependencies":[]}],"score":null,"created_at":"2025-09-04T15:50:14.086Z","updated_at":"2025-10-07T08:05:16.254Z","avatar_url":"https://github.com/NVIDIA-NeMo.png","language":"Python","category":null,"sub_category":null,"monthly_downloads":0,"funding_links":[],"readme_doi_urls":[],"works":{},"citation_counts":{},"total_citations":0,"keywords_from_contributors":[],"project_url":"https://science.ecosyste.ms/api/v1/projects/5089","html_url":"https://science.ecosyste.ms/projects/5089","bibtex_url":"https://science.ecosyste.ms/projects/5089/export.bibtex","apalike_url":"https://science.ecosyste.ms/projects/5089/export.apalike"}