{"id":146789,"name":null,"description":"Tools for managing datasets for governance and training.","url":"https://github.com/bigscience-workshop/data_tooling","last_synced_at":"2025-09-09T14:40:59.335Z","repository":{"id":39880587,"uuid":"377527862","full_name":"bigscience-workshop/data_tooling","owner":"bigscience-workshop","description":"Tools for managing datasets for governance and training.","archived":false,"fork":false,"pushed_at":"2025-08-11T17:15:03.000Z","size":228083,"stargazers_count":83,"open_issues_count":141,"forks_count":46,"subscribers_count":16,"default_branch":"master","last_synced_at":"2025-09-01T05:59:04.988Z","etag":null,"topics":[],"latest_commit_sha":null,"homepage":"","language":"HTML","has_issues":true,"has_wiki":null,"has_pages":null,"mirror_url":null,"source_name":null,"license":"apache-2.0","status":null,"scm":"git","pull_requests_enabled":true,"icon_url":"https://github.com/bigscience-workshop.png","metadata":{"files":{"readme":"README.md","changelog":null,"contributing":null,"funding":null,"license":"LICENSE","code_of_conduct":null,"threat_model":null,"audit":null,"citation":null,"codeowners":null,"security":null,"support":null,"governance":null,"roadmap":null,"authors":null,"dei":null,"publiccode":null,"codemeta":null,"zenodo":null}},"created_at":"2021-06-16T14:38:33.000Z","updated_at":"2025-08-29T21:59:52.000Z","dependencies_parsed_at":"2023-10-02T23:49:44.016Z","dependency_job_id":"e527079b-5653-4372-9fab-eba8d6282ca5","html_url":"https://github.com/bigscience-workshop/data_tooling","commit_stats":null,"previous_names":[],"tags_count":0,"template":false,"template_full_name":null,"purl":"pkg:github/bigscience-workshop/data_tooling","repository_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/bigscience-workshop%2Fdata_tooling","tags_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/bigscience-workshop%2Fdata_tooling/tags","releases_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/bigscience-workshop%2Fdata_tooling/releases","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/bigscience-workshop%2Fdata_tooling/manifests","owner_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/bigscience-workshop","download_url":"https://codeload.github.com/bigscience-workshop/data_tooling/tar.gz/refs/heads/master","sbom_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/bigscience-workshop%2Fdata_tooling/sbom","scorecard":null,"host":{"name":"GitHub","url":"https://github.com","kind":"github","repositories_count":274314278,"owners_count":25262605,"icon_url":"https://github.com/github.png","version":null,"created_at":"2022-05-30T11:31:42.601Z","updated_at":"2022-07-04T15:15:14.044Z","status":"online","status_checked_at":"2025-09-09T02:00:10.223Z","response_time":80,"last_error":null,"robots_txt_status":"success","robots_txt_updated_at":"2025-07-24T06:49:26.215Z","robots_txt_url":"https://github.com/robots.txt","online":true,"can_crawl_api":true,"host_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub","repositories_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories","repository_names_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repository_names","owners_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners"}},"owner":{"login":"bigscience-workshop","name":"BigScience Workshop","uuid":"82455566","kind":"organization","description":"Research workshop on large language models - The Summer of Language Models 21","email":"bigscience-contact@googlegroups.com","website":"https://bigscience.huggingface.co","location":null,"twitter":"BigScienceW","company":null,"icon_url":"https://avatars.githubusercontent.com/u/82455566?v=4","repositories_count":28,"last_synced_at":"2023-03-03T19:53:10.825Z","metadata":{"has_sponsors_listing":false},"html_url":"https://github.com/bigscience-workshop","funding_links":[],"total_stars":null,"followers":null,"following":null,"created_at":"2022-11-14T05:10:07.455Z","updated_at":"2023-03-03T19:53:10.861Z","owner_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/bigscience-workshop","repositories_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/bigscience-workshop/repositories"},"packages":[],"commits":{"id":5057058,"full_name":"bigscience-workshop/data_tooling","default_branch":"master","committers":null,"total_commits":null,"total_committers":null,"total_bot_commits":null,"total_bot_committers":null,"mean_commits":null,"dds":null,"past_year_committers":null,"past_year_total_commits":null,"past_year_total_committers":null,"past_year_total_bot_commits":null,"past_year_total_bot_committers":null,"past_year_mean_commits":null,"past_year_dds":null,"last_synced_at":null,"last_synced_commit":null,"created_at":"2024-11-11T03:12:42.261Z","updated_at":"2024-11-11T03:12:42.261Z","commits_url":"https://commits.ecosyste.ms/api/v1/hosts/GitHub/repositories/bigscience-workshop%2Fdata_tooling/commits","host":{"name":"GitHub","url":"https://github.com","kind":"github","last_synced_at":"2025-09-09T00:17:06.873Z","repositories_count":5524511,"commits_count":868166479,"contributors_count":31942272,"owners_count":917070,"icon_url":"https://github.com/github.png","host_url":"https://commits.ecosyste.ms/api/v1/hosts/GitHub","repositories_url":"https://commits.ecosyste.ms/api/v1/hosts/GitHub/repositories"}},"issues_stats":{"full_name":"bigscience-workshop/data_tooling","html_url":"https://github.com/bigscience-workshop/data_tooling","last_synced_at":"2025-02-26T05:20:00.042Z","status":null,"issues_count":90,"pull_requests_count":97,"avg_time_to_close_issue":5182305.935483871,"avg_time_to_close_pull_request":283230.9893617021,"issues_closed_count":62,"pull_requests_closed_count":94,"pull_request_authors_count":30,"issue_authors_count":9,"avg_comments_per_issue":3.888888888888889,"avg_comments_per_pull_request":1.134020618556701,"merged_pull_requests_count":81,"bot_issues_count":0,"bot_pull_requests_count":10,"past_year_issues_count":0,"past_year_pull_requests_count":0,"past_year_avg_time_to_close_issue":null,"past_year_avg_time_to_close_pull_request":null,"past_year_issues_closed_count":0,"past_year_pull_requests_closed_count":0,"past_year_pull_request_authors_count":0,"past_year_issue_authors_count":0,"past_year_avg_comments_per_issue":null,"past_year_avg_comments_per_pull_request":null,"past_year_bot_issues_count":0,"past_year_bot_pull_requests_count":0,"past_year_merged_pull_requests_count":0,"created_at":"2024-11-11T03:12:44.119Z","updated_at":"2025-02-26T05:20:00.042Z","repository_url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/repositories/bigscience-workshop%2Fdata_tooling","issues_url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/repositories/bigscience-workshop%2Fdata_tooling/issues","issue_labels_count":{"data catalog":60,"language modeling script":34,"tooling":11,"need custodian permission":6,"need data sourcing feedback":6,"corpus":5,"duplicate":5,"wontfix":4,"good first issue":4,"metadata":3,"data format":3,"help wanted":2,"documentation":1,"evaluation":1,"filter":1,"tokenizer":1},"pull_request_labels_count":{"tooling":1},"issue_author_associations_count":{"MEMBER":64,"COLLABORATOR":20,"NONE":3,"CONTRIBUTOR":3},"pull_request_author_associations_count":{"CONTRIBUTOR":52,"COLLABORATOR":22,"MEMBER":19,"NONE":4},"issue_authors":{"albertvillanova":60,"huu4ontocord":20,"olinguyen":2,"ggdupont":2,"cccntu":2,"JoeyOhman":1,"asoroa":1,"mavela":1,"yuvalkirstain":1},"pull_request_authors":{"huu4ontocord":21,"SaulLu":12,"pre-commit-ci[bot]":10,"olinguyen":9,"edugp":4,"clancyoftheoverflow":3,"Luvata":3,"majauhar":3,"ianyu93":3,"ChenghaoMou":3,"Skylion007":2,"asoroa":2,"jtboing":2,"ggdupont":2,"paulovn":2,"thomasw21":2,"hbenyamina":1,"afaji":1,"albertvillanova":1,"abumafrim":1,"ruinunca":1,"HugoLaurencon":1,"chris-ha458":1,"omarespejel":1,"onadegibert":1,"reshinthadithyan":1,"sashavor":1,"heraclex12":1,"JetRunner":1,"manandey":1},"host":{"name":"GitHub","url":"https://github.com","kind":"github","last_synced_at":"2025-08-23T00:00:23.489Z","repositories_count":8452011,"issues_count":15870009,"pull_requests_count":37829448,"authors_count":5664630,"icon_url":"https://github.com/github.png","host_url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub","repositories_url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/repositories","owners_url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/owners","authors_url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors"},"past_year_issue_labels_count":{},"past_year_pull_request_labels_count":{},"past_year_issue_author_associations_count":{},"past_year_pull_request_author_associations_count":{},"past_year_issue_authors":{},"past_year_pull_request_authors":{},"maintainers":[{"login":"albertvillanova","count":61,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/albertvillanova"},{"login":"huu4ontocord","count":41,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/huu4ontocord"},{"login":"olinguyen","count":11,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/olinguyen"},{"login":"ggdupont","count":4,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/ggdupont"},{"login":"clancyoftheoverflow","count":3,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/clancyoftheoverflow"},{"login":"thomasw21","count":2,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/thomasw21"},{"login":"HugoLaurencon","count":1,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/HugoLaurencon"},{"login":"heraclex12","count":1,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/heraclex12"},{"login":"manandey","count":1,"url":"https://issues.ecosyste.ms/api/v1/hosts/GitHub/authors/manandey"}],"active_maintainers":[]},"events":{"total":{"WatchEvent":6,"PushEvent":9},"last_year":{"WatchEvent":6,"PushEvent":9}},"keywords":[],"dependencies":[{"ecosystem":"pypi","filepath":"cc_pseudo_crawl/python_scripts/extract_text/requirements.txt","sha":null,"kind":"manifest","created_at":"2022-09-07T15:12:54.750Z","updated_at":"2022-09-07T15:12:54.750Z","repository_link":"https://github.com/bigscience-workshop/data_tooling/blob/master/cc_pseudo_crawl/python_scripts/extract_text/requirements.txt","dependencies":[{"id":4115168144,"package_name":"git","ecosystem":"pypi","requirements":"*","direct":true,"kind":"runtime","optional":false}]},{"ecosystem":"pypi","filepath":"cc_pseudo_crawl/python_scripts/requirements.txt","sha":null,"kind":"manifest","created_at":"2022-09-07T15:12:54.955Z","updated_at":"2022-09-07T15:12:54.955Z","repository_link":"https://github.com/bigscience-workshop/data_tooling/blob/master/cc_pseudo_crawl/python_scripts/requirements.txt","dependencies":[{"id":4115169239,"package_name":"boto3","ecosystem":"pypi","requirements":"*","direct":true,"kind":"runtime","optional":false},{"id":4115169240,"package_name":"bs4","ecosystem":"pypi","requirements":"*","direct":true,"kind":"runtime","optional":false},{"id":4115169241,"package_name":"datasets","ecosystem":"pypi","requirements":"*","direct":true,"kind":"runtime","optional":false},{"id":4115169242,"package_name":"pyathena","ecosystem":"pypi","requirements":"*","direct":true,"kind":"runtime","optional":false},{"id":4115169243,"package_name":"surt","ecosystem":"pypi","requirements":"*","direct":true,"kind":"runtime","optional":false},{"id":4115169244,"package_name":"tldextract","ecosystem":"pypi","requirements":"*","direct":true,"kind":"runtime","optional":false},{"id":4115169245,"package_name":"warcio","ecosystem":"pypi","requirements":"*","direct":true,"kind":"runtime","optional":false}]},{"ecosystem":"pypi","filepath":"index_search/requirements.txt","sha":null,"kind":"manifest","created_at":"2022-09-07T15:12:55.100Z","updated_at":"2022-09-07T15:12:55.100Z","repository_link":"https://github.com/bigscience-workshop/data_tooling/blob/master/index_search/requirements.txt","dependencies":[{"id":4115169270,"package_name":"datasets","ecosystem":"pypi","requirements":"bigscience_datatooling","direct":true,"kind":"runtime","optional":false},{"id":4115169271,"package_name":"elasticsearch","ecosystem":"pypi","requirements":"==7.10.1","direct":true,"kind":"runtime","optional":false},{"id":4115169272,"package_name":"iso-639","ecosystem":"pypi","requirements":"==0.4.5","direct":true,"kind":"runtime","optional":false},{"id":4115169273,"package_name":"ray","ecosystem":"pypi","requirements":"*","direct":true,"kind":"runtime","optional":false},{"id":4115169274,"package_name":"simplejson","ecosystem":"pypi","requirements":"*","direct":true,"kind":"runtime","optional":false}]},{"ecosystem":"pypi","filepath":"kenlm_training/setup.py","sha":null,"kind":"manifest","created_at":"2022-09-07T15:12:55.177Z","updated_at":"2022-09-07T15:12:55.177Z","repository_link":"https://github.com/bigscience-workshop/data_tooling/blob/master/kenlm_training/setup.py","dependencies":[{"id":4115169330,"package_name":"beautifulsoup4","ecosystem":"pypi","requirements":"\u003e=4.7.1","direct":true,"kind":"runtime","optional":false},{"id":4115169331,"package_name":"pandas","ecosystem":"pypi","requirements":"\u003e=0.23.4","direct":true,"kind":"runtime","optional":false},{"id":4115169332,"package_name":"requests","ecosystem":"pypi","requirements":"\u003e=2.22.0","direct":true,"kind":"runtime","optional":false},{"id":4115169333,"package_name":"fasttext","ecosystem":"pypi","requirements":"\u003e=0.9.1","direct":true,"kind":"runtime","optional":false},{"id":4115169334,"package_name":"sentencepiece","ecosystem":"pypi","requirements":"\u003e=0.1.82","direct":true,"kind":"runtime","optional":false},{"id":4115169335,"package_name":"kenlm","ecosystem":"pypi","requirements":"*","direct":true,"kind":"runtime","optional":false},{"id":4115169336,"package_name":"func_argparse","ecosystem":"pypi","requirements":"\u003e=1.1.1","direct":true,"kind":"runtime","optional":false},{"id":4115169337,"package_name":"psutil","ecosystem":"pypi","requirements":"\u003e=5.6.3","direct":true,"kind":"runtime","optional":false},{"id":4115169338,"package_name":"sacremoses","ecosystem":"pypi","requirements":"*","direct":true,"kind":"runtime","optional":false},{"id":4115169339,"package_name":"submitit","ecosystem":"pypi","requirements":"\u003e=1.0.0","direct":true,"kind":"runtime","optional":false},{"id":4115169340,"package_name":"typing_extensions","ecosystem":"pypi","requirements":"*","direct":true,"kind":"runtime","optional":false},{"id":4115169341,"package_name":"datasets","ecosystem":"pypi","requirements":"==1.16.1","direct":true,"kind":"runtime","optional":false}]},{"ecosystem":"pypi","filepath":"perplexity_lenses/poetry.lock","sha":null,"kind":"lockfile","created_at":"2022-09-07T15:12:55.423Z","updated_at":"2022-09-07T15:12:55.423Z","repository_link":"https://github.com/bigscience-workshop/data_tooling/blob/master/perplexity_lenses/poetry.lock","dependencies":[{"id":4115170506,"package_name":"aiohttp","ecosystem":"pypi","requirements":"3.8.1","direct":false,"kind":"runtime","optional":false},{"id":4115170507,"package_name":"aiosignal","ecosystem":"pypi","requirements":"1.2.0","direct":false,"kind":"runtime","optional":false},{"id":4115170508,"package_name":"altair","ecosystem":"pypi","requirements":"4.1.0","direct":false,"kind":"runtime","optional":false},{"id":4115170509,"package_name":"appnope","ecosystem":"pypi","requirements":"0.1.2","direct":false,"kind":"runtime","optional":false},{"id":4115170510,"package_name":"argcomplete","ecosystem":"pypi","requirements":"1.12.3","direct":false,"kind":"runtime","optional":false},{"id":4115170511,"package_name":"argon2-cffi","ecosystem":"pypi","requirements":"21.1.0","direct":false,"kind":"runtime","optional":false},{"id":4115170512,"package_name":"astor","ecosystem":"pypi","requirements":"0.8.1","direct":false,"kind":"runtime","optional":false},{"id":4115170513,"package_name":"async-timeout","ecosystem":"pypi","requirements":"4.0.1","direct":false,"kind":"runtime","optional":false},{"id":4115170514,"package_name":"asynctest","ecosystem":"pypi","requirements":"0.13.0","direct":false,"kind":"runtime","optional":false},{"id":4115170515,"package_name":"atomicwrites","ecosystem":"pypi","requirements":"1.4.0","direct":false,"kind":"develop","optional":false},{"id":4115170516,"package_name":"attrs","ecosystem":"pypi","requirements":"21.2.0","direct":false,"kind":"runtime","optional":false},{"id":4115170517,"package_name":"backcall","ecosystem":"pypi","requirements":"0.2.0","direct":false,"kind":"runtime","optional":false},{"id":4115170518,"package_name":"backports.zoneinfo","ecosystem":"pypi","requirements":"0.2.1","direct":false,"kind":"runtime","optional":false},{"id":4115170519,"package_name":"base58","ecosystem":"pypi","requirements":"2.1.1","direct":false,"kind":"runtime","optional":false},{"id":4115170520,"package_name":"black","ecosystem":"pypi","requirements":"21.11b1","direct":false,"kind":"runtime","optional":false},{"id":4115170521,"package_name":"bleach","ecosystem":"pypi","requirements":"4.1.0","direct":false,"kind":"runtime","optional":false},{"id":4115170522,"package_name":"blinker","ecosystem":"pypi","requirements":"1.4","direct":false,"kind":"runtime","optional":false},{"id":4115170523,"package_name":"bokeh","ecosystem":"pypi","requirements":"2.2.2","direct":false,"kind":"runtime","optional":false},{"id":4115170524,"package_name":"cachetools","ecosystem":"pypi","requirements":"4.2.4","direct":false,"kind":"runtime","optional":false},{"id":4115170525,"package_name":"certifi","ecosystem":"pypi","requirements":"2021.10.8","direct":false,"kind":"runtime","optional":false},{"id":4115170526,"package_name":"cffi","ecosystem":"pypi","requirements":"1.15.0","direct":false,"kind":"runtime","optional":false},{"id":4115170527,"package_name":"charset-normalizer","ecosystem":"pypi","requirements":"2.0.7","direct":false,"kind":"runtime","optional":false},{"id":4115170528,"package_name":"click","ecosystem":"pypi","requirements":"7.1.2","direct":false,"kind":"runtime","optional":false},{"id":4115170529,"package_name":"colorama","ecosystem":"pypi","requirements":"0.4.4","direct":false,"kind":"runtime","optional":false},{"id":4115170530,"package_name":"datasets","ecosystem":"pypi","requirements":"1.14.0","direct":false,"kind":"runtime","optional":false},{"id":4115170531,"package_name":"debugpy","ecosystem":"pypi","requirements":"1.5.1","direct":false,"kind":"runtime","optional":false},{"id":4115170532,"package_name":"decorator","ecosystem":"pypi","requirements":"5.1.0","direct":false,"kind":"runtime","optional":false},{"id":4115170533,"package_name":"defusedxml","ecosystem":"pypi","requirements":"0.7.1","direct":false,"kind":"runtime","optional":false},{"id":4115170534,"package_name":"dill","ecosystem":"pypi","requirements":"0.3.4","direct":false,"kind":"runtime","optional":false},{"id":4115170535,"package_name":"embedding-lenses","ecosystem":"pypi","requirements":"0.9.0","direct":false,"kind":"runtime","optional":false},{"id":4115170536,"package_name":"entrypoints","ecosystem":"pypi","requirements":"0.3","direct":false,"kind":"runtime","optional":false},{"id":4115170537,"package_name":"filelock","ecosystem":"pypi","requirements":"3.4.0","direct":false,"kind":"runtime","optional":false},{"id":4115170538,"package_name":"flake8","ecosystem":"pypi","requirements":"4.0.1","direct":false,"kind":"runtime","optional":false},{"id":4115170539,"package_name":"frozenlist","ecosystem":"pypi","requirements":"1.2.0","direct":false,"kind":"runtime","optional":false},{"id":4115170540,"package_name":"fsspec","ecosystem":"pypi","requirements":"2021.11.1","direct":false,"kind":"runtime","optional":false},{"id":4115170541,"package_name":"gitdb","ecosystem":"pypi","requirements":"4.0.9","direct":false,"kind":"runtime","optional":false},{"id":4115170542,"package_name":"gitpython","ecosystem":"pypi","requirements":"3.1.24","direct":false,"kind":"runtime","optional":false},{"id":4115170543,"package_name":"huggingface-hub","ecosystem":"pypi","requirements":"0.0.19","direct":false,"kind":"runtime","optional":false},{"id":4115170544,"package_name":"idna","ecosystem":"pypi","requirements":"3.3","direct":false,"kind":"runtime","optional":false},{"id":4115170545,"package_name":"importlib-metadata","ecosystem":"pypi","requirements":"4.2.0","direct":false,"kind":"runtime","optional":false},{"id":4115170546,"package_name":"importlib-resources","ecosystem":"pypi","requirements":"5.4.0","direct":false,"kind":"runtime","optional":false},{"id":4115170547,"package_name":"ipykernel","ecosystem":"pypi","requirements":"6.5.1","direct":false,"kind":"runtime","optional":false},{"id":4115170548,"package_name":"ipython","ecosystem":"pypi","requirements":"7.29.0","direct":false,"kind":"runtime","optional":false},{"id":4115170549,"package_name":"ipython-genutils","ecosystem":"pypi","requirements":"0.2.0","direct":false,"kind":"runtime","optional":false},{"id":4115170550,"package_name":"ipywidgets","ecosystem":"pypi","requirements":"7.6.5","direct":false,"kind":"runtime","optional":false},{"id":4115170551,"package_name":"jedi","ecosystem":"pypi","requirements":"0.18.1","direct":false,"kind":"runtime","optional":false},{"id":4115170552,"package_name":"jinja2","ecosystem":"pypi","requirements":"3.0.3","direct":false,"kind":"runtime","optional":false},{"id":4115170553,"package_name":"joblib","ecosystem":"pypi","requirements":"1.1.0","direct":false,"kind":"runtime","optional":false},{"id":4115170554,"package_name":"jsonschema","ecosystem":"pypi","requirements":"4.2.1","direct":false,"kind":"runtime","optional":false},{"id":4115170555,"package_name":"jupyter-client","ecosystem":"pypi","requirements":"7.1.0","direct":false,"kind":"runtime","optional":false},{"id":4115170556,"package_name":"jupyter-core","ecosystem":"pypi","requirements":"4.9.1","direct":false,"kind":"runtime","optional":false},{"id":4115170557,"package_name":"jupyterlab-pygments","ecosystem":"pypi","requirements":"0.1.2","direct":false,"kind":"runtime","optional":false},{"id":4115170558,"package_name":"jupyterlab-widgets","ecosystem":"pypi","requirements":"1.0.2","direct":false,"kind":"runtime","optional":false},{"id":4115170559,"package_name":"kenlm","ecosystem":"pypi","requirements":"0.0.0","direct":false,"kind":"runtime","optional":false},{"id":4115170560,"package_name":"llvmlite","ecosystem":"pypi","requirements":"0.37.0","direct":false,"kind":"runtime","optional":false},{"id":4115170561,"package_name":"markupsafe","ecosystem":"pypi","requirements":"2.0.1","direct":false,"kind":"runtime","optional":false},{"id":4115170562,"package_name":"matplotlib-inline","ecosystem":"pypi","requirements":"0.1.3","direct":false,"kind":"runtime","optional":false},{"id":4115170563,"package_name":"mccabe","ecosystem":"pypi","requirements":"0.6.1","direct":false,"kind":"runtime","optional":false},{"id":4115170564,"package_name":"mistune","ecosystem":"pypi","requirements":"0.8.4","direct":false,"kind":"runtime","optional":false},{"id":4115170565,"package_name":"more-itertools","ecosystem":"pypi","requirements":"8.12.0","direct":false,"kind":"develop","optional":false},{"id":4115170566,"package_name":"multidict","ecosystem":"pypi","requirements":"5.2.0","direct":false,"kind":"runtime","optional":false},{"id":4115170567,"package_name":"multiprocess","ecosystem":"pypi","requirements":"0.70.12.2","direct":false,"kind":"runtime","optional":false},{"id":4115170568,"package_name":"mypy-extensions","ecosystem":"pypi","requirements":"0.4.3","direct":false,"kind":"runtime","optional":false},{"id":4115170569,"package_name":"nbclient","ecosystem":"pypi","requirements":"0.5.9","direct":false,"kind":"runtime","optional":false},{"id":4115170570,"package_name":"nbconvert","ecosystem":"pypi","requirements":"6.3.0","direct":false,"kind":"runtime","optional":false},{"id":4115170571,"package_name":"nbformat","ecosystem":"pypi","requirements":"5.1.3","direct":false,"kind":"runtime","optional":false},{"id":4115170572,"package_name":"nest-asyncio","ecosystem":"pypi","requirements":"1.5.1","direct":false,"kind":"runtime","optional":false},{"id":4115170573,"package_name":"nltk","ecosystem":"pypi","requirements":"3.6.5","direct":false,"kind":"runtime","optional":false},{"id":4115170574,"package_name":"notebook","ecosystem":"pypi","requirements":"6.4.6","direct":false,"kind":"runtime","optional":false},{"id":4115170575,"package_name":"numba","ecosystem":"pypi","requirements":"0.54.1","direct":false,"kind":"runtime","optional":false},{"id":4115170576,"package_name":"numpy","ecosystem":"pypi","requirements":"1.20.0","direct":false,"kind":"runtime","optional":false},{"id":4115170577,"package_name":"packaging","ecosystem":"pypi","requirements":"21.3","direct":false,"kind":"runtime","optional":false},{"id":4115170578,"package_name":"pandas","ecosystem":"pypi","requirements":"1.1.5","direct":false,"kind":"runtime","optional":false},{"id":4115170579,"package_name":"pandocfilters","ecosystem":"pypi","requirements":"1.5.0","direct":false,"kind":"runtime","optional":false},{"id":4115170580,"package_name":"parso","ecosystem":"pypi","requirements":"0.8.2","direct":false,"kind":"runtime","optional":false},{"id":4115170581,"package_name":"pathspec","ecosystem":"pypi","requirements":"0.9.0","direct":false,"kind":"runtime","optional":false},{"id":4115170582,"package_name":"pexpect","ecosystem":"pypi","requirements":"4.8.0","direct":false,"kind":"runtime","optional":false},{"id":4115170583,"package_name":"pickleshare","ecosystem":"pypi","requirements":"0.7.5","direct":false,"kind":"runtime","optional":false},{"id":4115170584,"package_name":"pillow","ecosystem":"pypi","requirements":"8.4.0","direct":false,"kind":"runtime","optional":false},{"id":4115170585,"package_name":"platformdirs","ecosystem":"pypi","requirements":"2.4.0","direct":false,"kind":"runtime","optional":false},{"id":4115170586,"package_name":"pluggy","ecosystem":"pypi","requirements":"0.13.1","direct":false,"kind":"develop","optional":false},{"id":4115170587,"package_name":"prometheus-client","ecosystem":"pypi","requirements":"0.12.0","direct":false,"kind":"runtime","optional":false},{"id":4115170588,"package_name":"prompt-toolkit","ecosystem":"pypi","requirements":"3.0.22","direct":false,"kind":"runtime","optional":false},{"id":4115170589,"package_name":"protobuf","ecosystem":"pypi","requirements":"3.19.1","direct":false,"kind":"runtime","optional":false},{"id":4115170590,"package_name":"ptyprocess","ecosystem":"pypi","requirements":"0.7.0","direct":false,"kind":"runtime","optional":false},{"id":4115170591,"package_name":"py","ecosystem":"pypi","requirements":"1.11.0","direct":false,"kind":"runtime","optional":false},{"id":4115170592,"package_name":"pyarrow","ecosystem":"pypi","requirements":"6.0.1","direct":false,"kind":"runtime","optional":false},{"id":4115170593,"package_name":"pycodestyle","ecosystem":"pypi","requirements":"2.8.0","direct":false,"kind":"runtime","optional":false},{"id":4115170594,"package_name":"pycparser","ecosystem":"pypi","requirements":"2.21","direct":false,"kind":"runtime","optional":false},{"id":4115170595,"package_name":"pydeck","ecosystem":"pypi","requirements":"0.7.1","direct":false,"kind":"runtime","optional":false},{"id":4115170596,"package_name":"pyflakes","ecosystem":"pypi","requirements":"2.4.0","direct":false,"kind":"runtime","optional":false},{"id":4115170597,"package_name":"pygments","ecosystem":"pypi","requirements":"2.10.0","direct":false,"kind":"runtime","optional":false},{"id":4115170598,"package_name":"pynndescent","ecosystem":"pypi","requirements":"0.5.5","direct":false,"kind":"runtime","optional":false},{"id":4115170599,"package_name":"pyparsing","ecosystem":"pypi","requirements":"3.0.6","direct":false,"kind":"runtime","optional":false},{"id":4115170600,"package_name":"pyrsistent","ecosystem":"pypi","requirements":"0.18.0","direct":false,"kind":"runtime","optional":false},{"id":4115170601,"package_name":"pytest","ecosystem":"pypi","requirements":"5.4.3","direct":false,"kind":"develop","optional":false},{"id":4115170602,"package_name":"python-dateutil","ecosystem":"pypi","requirements":"2.8.2","direct":false,"kind":"runtime","optional":false},{"id":4115170603,"package_name":"pytz","ecosystem":"pypi","requirements":"2021.3","direct":false,"kind":"runtime","optional":false},{"id":4115170604,"package_name":"pytz-deprecation-shim","ecosystem":"pypi","requirements":"0.1.0.post0","direct":false,"kind":"runtime","optional":false},{"id":4115170605,"package_name":"pywin32","ecosystem":"pypi","requirements":"302","direct":false,"kind":"runtime","optional":false},{"id":4115170606,"package_name":"pywinpty","ecosystem":"pypi","requirements":"1.1.6","direct":false,"kind":"runtime","optional":false},{"id":4115170607,"package_name":"pyyaml","ecosystem":"pypi","requirements":"6.0","direct":false,"kind":"runtime","optional":false},{"id":4115170608,"package_name":"pyzmq","ecosystem":"pypi","requirements":"22.3.0","direct":false,"kind":"runtime","optional":false},{"id":4115170609,"package_name":"regex","ecosystem":"pypi","requirements":"2021.11.10","direct":false,"kind":"runtime","optional":false},{"id":4115170610,"package_name":"requests","ecosystem":"pypi","requirements":"2.26.0","direct":false,"kind":"runtime","optional":false},{"id":4115170611,"package_name":"sacremoses","ecosystem":"pypi","requirements":"0.0.46","direct":false,"kind":"runtime","optional":false},{"id":4115170612,"package_name":"scikit-learn","ecosystem":"pypi","requirements":"0.24.2","direct":false,"kind":"runtime","optional":false},{"id":4115170613,"package_name":"scipy","ecosystem":"pypi","requirements":"1.7.2","direct":false,"kind":"runtime","optional":false},{"id":4115170614,"package_name":"send2trash","ecosystem":"pypi","requirements":"1.8.0","direct":false,"kind":"runtime","optional":false},{"id":4115170615,"package_name":"sentence-transformers","ecosystem":"pypi","requirements":"2.0.0","direct":false,"kind":"runtime","optional":false},{"id":4115170616,"package_name":"sentencepiece","ecosystem":"pypi","requirements":"0.1.96","direct":false,"kind":"runtime","optional":false},{"id":4115170617,"package_name":"six","ecosystem":"pypi","requirements":"1.16.0","direct":false,"kind":"runtime","optional":false},{"id":4115170618,"package_name":"smmap","ecosystem":"pypi","requirements":"5.0.0","direct":false,"kind":"runtime","optional":false},{"id":4115170619,"package_name":"streamlit","ecosystem":"pypi","requirements":"1.1.0","direct":false,"kind":"runtime","optional":false},{"id":4115170620,"package_name":"terminado","ecosystem":"pypi","requirements":"0.12.1","direct":false,"kind":"runtime","optional":false},{"id":4115170621,"package_name":"testpath","ecosystem":"pypi","requirements":"0.5.0","direct":false,"kind":"runtime","optional":false},{"id":4115170622,"package_name":"threadpoolctl","ecosystem":"pypi","requirements":"3.0.0","direct":false,"kind":"runtime","optional":false},{"id":4115170623,"package_name":"tokenizers","ecosystem":"pypi","requirements":"0.10.3","direct":false,"kind":"runtime","optional":false},{"id":4115170624,"package_name":"toml","ecosystem":"pypi","requirements":"0.10.2","direct":false,"kind":"runtime","optional":false},{"id":4115170625,"package_name":"tomli","ecosystem":"pypi","requirements":"1.2.2","direct":false,"kind":"runtime","optional":false},{"id":4115170626,"package_name":"toolz","ecosystem":"pypi","requirements":"0.11.2","direct":false,"kind":"runtime","optional":false},{"id":4115170627,"package_name":"torch","ecosystem":"pypi","requirements":"1.10.0","direct":false,"kind":"runtime","optional":false},{"id":4115170628,"package_name":"torchvision","ecosystem":"pypi","requirements":"0.11.1","direct":false,"kind":"runtime","optional":false},{"id":4115170629,"package_name":"tornado","ecosystem":"pypi","requirements":"6.1","direct":false,"kind":"runtime","optional":false},{"id":4115170630,"package_name":"tqdm","ecosystem":"pypi","requirements":"4.62.3","direct":false,"kind":"runtime","optional":false},{"id":4115170631,"package_name":"traitlets","ecosystem":"pypi","requirements":"5.1.1","direct":false,"kind":"runtime","optional":false},{"id":4115170632,"package_name":"transformers","ecosystem":"pypi","requirements":"4.11.3","direct":false,"kind":"runtime","optional":false},{"id":4115170633,"package_name":"typed-ast","ecosystem":"pypi","requirements":"1.5.0","direct":false,"kind":"runtime","optional":false},{"id":4115170634,"package_name":"typer","ecosystem":"pypi","requirements":"0.4.0","direct":false,"kind":"runtime","optional":false},{"id":4115170635,"package_name":"typing-extensions","ecosystem":"pypi","requirements":"4.0.0","direct":false,"kind":"runtime","optional":false},{"id":4115170636,"package_name":"tzdata","ecosystem":"pypi","requirements":"2021.5","direct":false,"kind":"runtime","optional":false},{"id":4115170637,"package_name":"tzlocal","ecosystem":"pypi","requirements":"4.1","direct":false,"kind":"runtime","optional":false},{"id":4115170638,"package_name":"umap-learn","ecosystem":"pypi","requirements":"0.5.2","direct":false,"kind":"runtime","optional":false},{"id":4115170639,"package_name":"urllib3","ecosystem":"pypi","requirements":"1.26.7","direct":false,"kind":"runtime","optional":false},{"id":4115170640,"package_name":"validators","ecosystem":"pypi","requirements":"0.18.2","direct":false,"kind":"runtime","optional":false},{"id":4115170641,"package_name":"watchdog","ecosystem":"pypi","requirements":"2.1.3","direct":false,"kind":"runtime","optional":false},{"id":4115170642,"package_name":"wcwidth","ecosystem":"pypi","requirements":"0.2.5","direct":false,"kind":"runtime","optional":false},{"id":4115170643,"package_name":"webencodings","ecosystem":"pypi","requirements":"0.5.1","direct":false,"kind":"runtime","optional":false},{"id":4115170644,"package_name":"widgetsnbextension","ecosystem":"pypi","requirements":"3.5.2","direct":false,"kind":"runtime","optional":false},{"id":4115170645,"package_name":"xxhash","ecosystem":"pypi","requirements":"2.0.2","direct":false,"kind":"runtime","optional":false},{"id":4115170646,"package_name":"yarl","ecosystem":"pypi","requirements":"1.7.2","direct":false,"kind":"runtime","optional":false},{"id":4115170647,"package_name":"zipp","ecosystem":"pypi","requirements":"3.6.0","direct":false,"kind":"runtime","optional":false}]},{"ecosystem":"pypi","filepath":"perplexity_lenses/pyproject.toml","sha":null,"kind":"manifest","created_at":"2022-09-07T15:12:55.517Z","updated_at":"2022-09-07T15:12:55.517Z","repository_link":"https://github.com/bigscience-workshop/data_tooling/blob/master/perplexity_lenses/pyproject.toml","dependencies":[{"id":4115170676,"package_name":"python","ecosystem":"pypi","requirements":"\u003e=3.7,\u003c3.10","direct":true,"kind":"runtime","optional":false},{"id":4115170677,"package_name":"huggingface-hub","ecosystem":"pypi","requirements":"0.0.19","direct":true,"kind":"runtime","optional":false},{"id":4115170678,"package_name":"streamlit","ecosystem":"pypi","requirements":"1.1.0","direct":true,"kind":"runtime","optional":false},{"id":4115170679,"package_name":"transformers","ecosystem":"pypi","requirements":"4.11.3","direct":true,"kind":"runtime","optional":false},{"id":4115170680,"package_name":"watchdog","ecosystem":"pypi","requirements":"2.1.3","direct":true,"kind":"runtime","optional":false},{"id":4115170681,"package_name":"sentence-transformers","ecosystem":"pypi","requirements":"2.0.0","direct":true,"kind":"runtime","optional":false},{"id":4115170682,"package_name":"bokeh","ecosystem":"pypi","requirements":"2.2.2","direct":true,"kind":"runtime","optional":false},{"id":4115170683,"package_name":"numpy","ecosystem":"pypi","requirements":"1.20.0","direct":true,"kind":"runtime","optional":false},{"id":4115170684,"package_name":"numba","ecosystem":"pypi","requirements":"^0.54.1","direct":true,"kind":"runtime","optional":false},{"id":4115170685,"package_name":"umap-learn","ecosystem":"pypi","requirements":"^0.5.2","direct":true,"kind":"runtime","optional":false},{"id":4115170686,"package_name":"datasets","ecosystem":"pypi","requirements":"1.14.0","direct":true,"kind":"runtime","optional":false},{"id":4115170687,"package_name":"black","ecosystem":"pypi","requirements":"^21.10b0","direct":true,"kind":"runtime","optional":false},{"id":4115170688,"package_name":"flake8","ecosystem":"pypi","requirements":"^4.0.1","direct":true,"kind":"runtime","optional":false},{"id":4115170689,"package_name":"scikit-learn","ecosystem":"pypi","requirements":"0.24.2","direct":true,"kind":"runtime","optional":false},{"id":4115170690,"package_name":"kenlm","ecosystem":"pypi","requirements":"*","direct":true,"kind":"runtime","optional":false},{"id":4115170691,"package_name":"embedding-lenses","ecosystem":"pypi","requirements":"0.9.0","direct":true,"kind":"runtime","optional":false},{"id":4115170692,"package_name":"typer","ecosystem":"pypi","requirements":"^0.4.0","direct":true,"kind":"runtime","optional":false},{"id":4115170693,"package_name":"pytest","ecosystem":"pypi","requirements":"^5.2","direct":true,"kind":"develop","optional":false}]},{"ecosystem":"pypi","filepath":"perplexity_lenses/requirements.txt","sha":null,"kind":"manifest","created_at":"2022-09-07T15:12:55.625Z","updated_at":"2022-09-07T15:12:55.625Z","repository_link":"https://github.com/bigscience-workshop/data_tooling/blob/master/perplexity_lenses/requirements.txt","dependencies":[{"id":4115170702,"package_name":"bokeh","ecosystem":"pypi","requirements":"==2.2.2","direct":true,"kind":"runtime","optional":false},{"id":4115170703,"package_name":"embedding-lenses","ecosystem":"pypi","requirements":"==0.9.0","direct":true,"kind":"runtime","optional":false},{"id":4115170704,"package_name":"huggingface-hub","ecosystem":"pypi","requirements":"==0.0.19","direct":true,"kind":"runtime","optional":false},{"id":4115170705,"package_name":"numpy","ecosystem":"pypi","requirements":"==1.20.0","direct":true,"kind":"runtime","optional":false},{"id":4115170706,"package_name":"sentence-transformers","ecosystem":"pypi","requirements":"==2.0.0","direct":true,"kind":"runtime","optional":false},{"id":4115170707,"package_name":"streamlit","ecosystem":"pypi","requirements":"==1.1.0","direct":true,"kind":"runtime","optional":false},{"id":4115170708,"package_name":"transformers","ecosystem":"pypi","requirements":"==4.11.3","direct":true,"kind":"runtime","optional":false},{"id":4115170709,"package_name":"typer","ecosystem":"pypi","requirements":"==0.4.0","direct":true,"kind":"runtime","optional":false},{"id":4115170710,"package_name":"umap-learn","ecosystem":"pypi","requirements":"==0.5.2","direct":true,"kind":"runtime","optional":false},{"id":4115170711,"package_name":"watchdog","ecosystem":"pypi","requirements":"==2.1.3","direct":true,"kind":"runtime","optional":false}]},{"ecosystem":"pypi","filepath":"pii-manager/requirements.txt","sha":null,"kind":"manifest","created_at":"2022-09-07T15:12:55.756Z","updated_at":"2022-09-07T15:12:55.756Z","repository_link":"https://github.com/bigscience-workshop/data_tooling/blob/master/pii-manager/requirements.txt","dependencies":[{"id":4115171629,"package_name":"python-stdnum","ecosystem":"pypi","requirements":"\u003e=1.17,\u003c2.0","direct":true,"kind":"runtime","optional":false},{"id":4115171630,"package_name":"regex","ecosystem":"pypi","requirements":"\u003e=2021.11.10","direct":true,"kind":"runtime","optional":false}]},{"ecosystem":"pypi","filepath":"poetry.lock","sha":null,"kind":"lockfile","created_at":"2022-09-07T15:12:55.873Z","updated_at":"2022-09-07T15:12:55.873Z","repository_link":"https://github.com/bigscience-workshop/data_tooling/blob/master/poetry.lock","dependencies":[{"id":4115173249,"package_name":"aiohttp","ecosystem":"pypi","requirements":"3.8.0","direct":false,"kind":"runtime","optional":false},{"id":4115173250,"package_name":"aiosignal","ecosystem":"pypi","requirements":"1.2.0","direct":false,"kind":"runtime","optional":false},{"id":4115173251,"package_name":"anyio","ecosystem":"pypi","requirements":"3.3.4","direct":false,"kind":"develop","optional":false},{"id":4115173252,"package_name":"appnope","ecosystem":"pypi","requirements":"0.1.2","direct":false,"kind":"develop","optional":false},{"id":4115173253,"package_name":"argcomplete","ecosystem":"pypi","requirements":"1.12.3","direct":false,"kind":"develop","optional":false},{"id":4115173254,"package_name":"argon2-cffi","ecosystem":"pypi","requirements":"21.1.0","direct":false,"kind":"develop","optional":false},{"id":4115173255,"package_name":"async-timeout","ecosystem":"pypi","requirements":"4.0.1","direct":false,"kind":"runtime","optional":false},{"id":4115173256,"package_name":"asynctest","ecosystem":"pypi","requirements":"0.13.0","direct":false,"kind":"runtime","optional":false},{"id":4115173257,"package_name":"atomicwrites","ecosystem":"pypi","requirements":"1.4.0","direct":false,"kind":"develop","optional":false},{"id":4115173258,"package_name":"attrs","ecosystem":"pypi","requirements":"21.2.0","direct":false,"kind":"runtime","optional":false},{"id":4115173259,"package_name":"babel","ecosystem":"pypi","requirements":"2.9.1","direct":false,"kind":"develop","optional":false},{"id":4115173260,"package_name":"backcall","ecosystem":"pypi","requirements":"0.2.0","direct":false,"kind":"develop","optional":false},{"id":4115173261,"package_name":"black","ecosystem":"pypi","requirements":"21.10b0","direct":false,"kind":"develop","optional":false},{"id":4115173262,"package_name":"bleach","ecosystem":"pypi","requirements":"4.1.0","direct":false,"kind":"develop","optional":false},{"id":4115173263,"package_name":"certifi","ecosystem":"pypi","requirements":"2021.10.8","direct":false,"kind":"runtime","optional":false},{"id":4115173264,"package_name":"cffi","ecosystem":"pypi","requirements":"1.15.0","direct":false,"kind":"develop","optional":false},{"id":4115173265,"package_name":"charset-normalizer","ecosystem":"pypi","requirements":"2.0.7","direct":false,"kind":"runtime","optional":false},{"id":4115173266,"package_name":"click","ecosystem":"pypi","requirements":"8.0.3","direct":false,"kind":"runtime","optional":false},{"id":4115173267,"package_name":"colorama","ecosystem":"pypi","requirements":"0.4.4","direct":false,"kind":"runtime","optional":false},{"id":4115173268,"package_name":"datasets","ecosystem":"pypi","requirements":"1.15.1","direct":false,"kind":"runtime","optional":false},{"id":4115173269,"package_name":"debugpy","ecosystem":"pypi","requirements":"1.5.1","direct":false,"kind":"develop","optional":false},{"id":4115173270,"package_name":"decorator","ecosystem":"pypi","requirements":"5.1.0","direct":false,"kind":"develop","optional":false},{"id":4115173271,"package_name":"defusedxml","ecosystem":"pypi","requirements":"0.7.1","direct":false,"kind":"develop","optional":false},{"id":4115173272,"package_name":"dill","ecosystem":"pypi","requirements":"0.3.4","direct":false,"kind":"runtime","optional":false},{"id":4115173273,"package_name":"entrypoints","ecosystem":"pypi","requirements":"0.3","direct":false,"kind":"develop","optional":false},{"id":4115173274,"package_name":"fancycompleter","ecosystem":"pypi","requirements":"0.9.1","direct":false,"kind":"develop","optional":false},{"id":4115173275,"package_name":"filelock","ecosystem":"pypi","requirements":"3.3.2","direct":false,"kind":"runtime","optional":false},{"id":4115173276,"package_name":"flake8","ecosystem":"pypi","requirements":"3.9.2","direct":false,"kind":"develop","optional":false},{"id":4115173277,"package_name":"frozenlist","ecosystem":"pypi","requirements":"1.2.0","direct":false,"kind":"runtime","optional":false},{"id":4115173278,"package_name":"fsspec","ecosystem":"pypi","requirements":"2021.11.0","direct":false,"kind":"runtime","optional":false},{"id":4115173279,"package_name":"huggingface-hub","ecosystem":"pypi","requirements":"0.1.2","direct":false,"kind":"runtime","optional":false},{"id":4115173280,"package_name":"idna","ecosystem":"pypi","requirements":"3.3","direct":false,"kind":"runtime","optional":false},{"id":4115173281,"package_name":"importlib-metadata","ecosystem":"pypi","requirements":"4.8.2","direct":false,"kind":"runtime","optional":false},{"id":4115173282,"package_name":"importlib-resources","ecosystem":"pypi","requirements":"5.4.0","direct":false,"kind":"develop","optional":false},{"id":4115173283,"package_name":"iniconfig","ecosystem":"pypi","requirements":"1.1.1","direct":false,"kind":"develop","optional":false},{"id":4115173284,"package_name":"ipykernel","ecosystem":"pypi","requirements":"6.5.0","direct":false,"kind":"develop","optional":false},{"id":4115173285,"package_name":"ipython","ecosystem":"pypi","requirements":"7.29.0","direct":false,"kind":"develop","optional":false},{"id":4115173286,"package_name":"ipython-genutils","ecosystem":"pypi","requirements":"0.2.0","direct":false,"kind":"develop","optional":false},{"id":4115173287,"package_name":"isort","ecosystem":"pypi","requirements":"5.10.1","direct":false,"kind":"develop","optional":false},{"id":4115173288,"package_name":"jedi","ecosystem":"pypi","requirements":"0.18.0","direct":false,"kind":"develop","optional":false},{"id":4115173289,"package_name":"jinja2","ecosystem":"pypi","requirements":"3.0.3","direct":false,"kind":"develop","optional":false},{"id":4115173290,"package_name":"joblib","ecosystem":"pypi","requirements":"1.1.0","direct":false,"kind":"runtime","optional":false},{"id":4115173291,"package_name":"json5","ecosystem":"pypi","requirements":"0.9.6","direct":false,"kind":"develop","optional":false},{"id":4115173292,"package_name":"jsonschema","ecosystem":"pypi","requirements":"4.2.1","direct":false,"kind":"develop","optional":false},{"id":4115173293,"package_name":"jupyter-client","ecosystem":"pypi","requirements":"7.0.6","direct":false,"kind":"develop","optional":false},{"id":4115173294,"package_name":"jupyter-core","ecosystem":"pypi","requirements":"4.9.1","direct":false,"kind":"develop","optional":false},{"id":4115173295,"package_name":"jupyter-server","ecosystem":"pypi","requirements":"1.11.2","direct":false,"kind":"develop","optional":false},{"id":4115173296,"package_name":"jupyterlab","ecosystem":"pypi","requirements":"3.2.3","direct":false,"kind":"develop","optional":false},{"id":4115173297,"package_name":"jupyterlab-pygments","ecosystem":"pypi","requirements":"0.1.2","direct":false,"kind":"develop","optional":false},{"id":4115173298,"package_name":"jupyterlab-server","ecosystem":"pypi","requirements":"2.8.2","direct":false,"kind":"develop","optional":false},{"id":4115173299,"package_name":"kenlm","ecosystem":"pypi","requirements":"0.0.0","direct":false,"kind":"runtime","optional":false},{"id":4115173300,"package_name":"markupsafe","ecosystem":"pypi","requirements":"2.0.1","direct":false,"kind":"develop","optional":false},{"id":4115173301,"package_name":"matplotlib-inline","ecosystem":"pypi","requirements":"0.1.3","direct":false,"kind":"develop","optional":false},{"id":4115173302,"package_name":"mccabe","ecosystem":"pypi","requirements":"0.6.1","direct":false,"kind":"develop","optional":false},{"id":4115173303,"package_name":"mistune","ecosystem":"pypi","requirements":"0.8.4","direct":false,"kind":"develop","optional":false},{"id":4115173304,"package_name":"multidict","ecosystem":"pypi","requirements":"5.2.0","direct":false,"kind":"runtime","optional":false},{"id":4115173305,"package_name":"multiprocess","ecosystem":"pypi","requirements":"0.70.12.2","direct":false,"kind":"runtime","optional":false},{"id":4115173306,"package_name":"mypy-extensions","ecosystem":"pypi","requirements":"0.4.3","direct":false,"kind":"develop","optional":false},{"id":4115173307,"package_name":"nbclassic","ecosystem":"pypi","requirements":"0.3.4","direct":false,"kind":"develop","optional":false},{"id":4115173308,"package_name":"nbclient","ecosystem":"pypi","requirements":"0.5.5","direct":false,"kind":"develop","optional":false},{"id":4115173309,"package_name":"nbconvert","ecosystem":"pypi","requirements":"6.2.0","direct":false,"kind":"develop","optional":false},{"id":4115173310,"package_name":"nbformat","ecosystem":"pypi","requirements":"5.1.3","direct":false,"kind":"develop","optional":false},{"id":4115173311,"package_name":"nest-asyncio","ecosystem":"pypi","requirements":"1.5.1","direct":false,"kind":"develop","optional":false},{"id":4115173312,"package_name":"nltk","ecosystem":"pypi","requirements":"3.6.5","direct":false,"kind":"runtime","optional":false},{"id":4115173313,"package_name":"notebook","ecosystem":"pypi","requirements":"6.4.5","direct":false,"kind":"develop","optional":false},{"id":4115173314,"package_name":"numpy","ecosystem":"pypi","requirements":"1.21.1","direct":false,"kind":"runtime","optional":false},{"id":4115173315,"package_name":"packaging","ecosystem":"pypi","requirements":"21.2","direct":false,"kind":"runtime","optional":false},{"id":4115173316,"package_name":"pandas","ecosystem":"pypi","requirements":"1.3.4","direct":false,"kind":"runtime","optional":false},{"id":4115173317,"package_name":"pandocfilters","ecosystem":"pypi","requirements":"1.5.0","direct":false,"kind":"develop","optional":false},{"id":4115173318,"package_name":"parso","ecosystem":"pypi","requirements":"0.8.2","direct":false,"kind":"develop","optional":false},{"id":4115173319,"package_name":"pathspec","ecosystem":"pypi","requirements":"0.9.0","direct":false,"kind":"develop","optional":false},{"id":4115173320,"package_name":"pdbpp","ecosystem":"pypi","requirements":"0.10.3","direct":false,"kind":"develop","optional":false},{"id":4115173321,"package_name":"pexpect","ecosystem":"pypi","requirements":"4.8.0","direct":false,"kind":"develop","optional":false},{"id":4115173322,"package_name":"pickleshare","ecosystem":"pypi","requirements":"0.7.5","direct":false,"kind":"develop","optional":false},{"id":4115173323,"package_name":"platformdirs","ecosystem":"pypi","requirements":"2.4.0","direct":false,"kind":"develop","optional":false},{"id":4115173324,"package_name":"pluggy","ecosystem":"pypi","requirements":"1.0.0","direct":false,"kind":"develop","optional":false},{"id":4115173325,"package_name":"prometheus-client","ecosystem":"pypi","requirements":"0.12.0","direct":false,"kind":"develop","optional":false},{"id":4115173326,"package_name":"prompt-toolkit","ecosystem":"pypi","requirements":"3.0.22","direct":false,"kind":"develop","optional":false},{"id":4115173327,"package_name":"ptyprocess","ecosystem":"pypi","requirements":"0.7.0","direct":false,"kind":"develop","optional":false},{"id":4115173328,"package_name":"py","ecosystem":"pypi","requirements":"1.11.0","direct":false,"kind":"develop","optional":false},{"id":4115173329,"package_name":"pyarrow","ecosystem":"pypi","requirements":"6.0.0","direct":false,"kind":"runtime","optional":false},{"id":4115173330,"package_name":"pycodestyle","ecosystem":"pypi","requirements":"2.7.0","direct":false,"kind":"develop","optional":false},{"id":4115173331,"package_name":"pycparser","ecosystem":"pypi","requirements":"2.21","direct":false,"kind":"develop","optional":false},{"id":4115173332,"package_name":"pyflakes","ecosystem":"pypi","requirements":"2.3.1","direct":false,"kind":"develop","optional":false},{"id":4115173333,"package_name":"pygments","ecosystem":"pypi","requirements":"2.10.0","direct":false,"kind":"develop","optional":false},{"id":4115173334,"package_name":"pyparsing","ecosystem":"pypi","requirements":"2.4.7","direct":false,"kind":"runtime","optional":false},{"id":4115173335,"package_name":"pyreadline","ecosystem":"pypi","requirements":"2.1","direct":false,"kind":"develop","optional":false},{"id":4115173336,"package_name":"pyrepl","ecosystem":"pypi","requirements":"0.9.0","direct":false,"kind":"develop","optional":false},{"id":4115173337,"package_name":"pyrsistent","ecosystem":"pypi","requirements":"0.18.0","direct":false,"kind":"develop","optional":false},{"id":4115173338,"package_name":"pytest","ecosystem":"pypi","requirements":"6.2.5","direct":false,"kind":"develop","optional":false},{"id":4115173339,"package_name":"python-dateutil","ecosystem":"pypi","requirements":"2.8.2","direct":false,"kind":"runtime","optional":false},{"id":4115173340,"package_name":"pytz","ecosystem":"pypi","requirements":"2021.3","direct":false,"kind":"runtime","optional":false},{"id":4115173341,"package_name":"pywin32","ecosystem":"pypi","requirements":"302","direct":false,"kind":"develop","optional":false},{"id":4115173342,"package_name":"pywinpty","ecosystem":"pypi","requirements":"1.1.5","direct":false,"kind":"develop","optional":false},{"id":4115173343,"package_name":"pyyaml","ecosystem":"pypi","requirements":"6.0","direct":false,"kind":"runtime","optional":false},{"id":4115173344,"package_name":"pyzmq","ecosystem":"pypi","requirements":"22.3.0","direct":false,"kind":"develop","optional":false},{"id":4115173345,"package_name":"regex","ecosystem":"pypi","requirements":"2021.11.10","direct":false,"kind":"runtime","optional":false},{"id":4115173346,"package_name":"requests","ecosystem":"pypi","requirements":"2.26.0","direct":false,"kind":"runtime","optional":false},{"id":4115173347,"package_name":"sacremoses","ecosystem":"pypi","requirements":"0.0.46","direct":false,"kind":"runtime","optional":false},{"id":4115173348,"package_name":"scikit-learn","ecosystem":"pypi","requirements":"1.0.1","direct":false,"kind":"runtime","optional":false},{"id":4115173349,"package_name":"scipy","ecosystem":"pypi","requirements":"1.6.1","direct":false,"kind":"runtime","optional":false},{"id":4115173350,"package_name":"send2trash","ecosystem":"pypi","requirements":"1.8.0","direct":false,"kind":"develop","optional":false},{"id":4115173351,"package_name":"simhash-py","ecosystem":"pypi","requirements":"0.4.0","direct":false,"kind":"runtime","optional":false},{"id":4115173352,"package_name":"six","ecosystem":"pypi","requirements":"1.16.0","direct":false,"kind":"runtime","optional":false},{"id":4115173353,"package_name":"sniffio","ecosystem":"pypi","requirements":"1.2.0","direct":false,"kind":"develop","optional":false},{"id":4115173354,"package_name":"terminado","ecosystem":"pypi","requirements":"0.12.1","direct":false,"kind":"develop","optional":false},{"id":4115173355,"package_name":"testpath","ecosystem":"pypi","requirements":"0.5.0","direct":false,"kind":"develop","optional":false},{"id":4115173356,"package_name":"threadpoolctl","ecosystem":"pypi","requirements":"3.0.0","direct":false,"kind":"runtime","optional":false},{"id":4115173357,"package_name":"tokenizers","ecosystem":"pypi","requirements":"0.10.3","direct":false,"kind":"runtime","optional":false},{"id":4115173358,"package_name":"toml","ecosystem":"pypi","requirements":"0.10.2","direct":false,"kind":"develop","optional":false},{"id":4115173359,"package_name":"tomli","ecosystem":"pypi","requirements":"1.2.2","direct":false,"kind":"develop","optional":false},{"id":4115173360,"package_name":"tornado","ecosystem":"pypi","requirements":"6.1","direct":false,"kind":"develop","optional":false},{"id":4115173361,"package_name":"tqdm","ecosystem":"pypi","requirements":"4.62.3","direct":false,"kind":"runtime","optional":false},{"id":4115173362,"package_name":"traitlets","ecosystem":"pypi","requirements":"5.1.1","direct":false,"kind":"develop","optional":false},{"id":4115173363,"package_name":"transformers","ecosystem":"pypi","requirements":"4.12.3","direct":false,"kind":"runtime","optional":false},{"id":4115173364,"package_name":"typed-ast","ecosystem":"pypi","requirements":"1.4.3","direct":false,"kind":"develop","optional":false},{"id":4115173365,"package_name":"typer","ecosystem":"pypi","requirements":"0.4.0","direct":false,"kind":"runtime","optional":false},{"id":4115173366,"package_name":"typing-extensions","ecosystem":"pypi","requirements":"3.10.0.2","direct":false,"kind":"runtime","optional":false},{"id":4115173367,"package_name":"urllib3","ecosystem":"pypi","requirements":"1.26.7","direct":false,"kind":"runtime","optional":false},{"id":4115173368,"package_name":"wcwidth","ecosystem":"pypi","requirements":"0.2.5","direct":false,"kind":"develop","optional":false},{"id":4115173369,"package_name":"webencodings","ecosystem":"pypi","requirements":"0.5.1","direct":false,"kind":"develop","optional":false},{"id":4115173370,"package_name":"websocket-client","ecosystem":"pypi","requirements":"1.2.1","direct":false,"kind":"develop","optional":false},{"id":4115173371,"package_name":"wmctrl","ecosystem":"pypi","requirements":"0.4","direct":false,"kind":"develop","optional":false},{"id":4115173372,"package_name":"xxhash","ecosystem":"pypi","requirements":"2.0.2","direct":false,"kind":"runtime","optional":false},{"id":4115173373,"package_name":"yarl","ecosystem":"pypi","requirements":"1.7.2","direct":false,"kind":"runtime","optional":false},{"id":4115173374,"package_name":"zipp","ecosystem":"pypi","requirements":"3.6.0","direct":false,"kind":"runtime","optional":false}]},{"ecosystem":"pypi","filepath":"pyproject.toml","sha":null,"kind":"manifest","created_at":"2022-09-07T15:12:56.060Z","updated_at":"2022-09-07T15:12:56.060Z","repository_link":"https://github.com/bigscience-workshop/data_tooling/blob/master/pyproject.toml","dependencies":[{"id":4115173923,"package_name":"python","ecosystem":"pypi","requirements":"^3.7.10","direct":true,"kind":"runtime","optional":false},{"id":4115173930,"package_name":"datasets","ecosystem":"pypi","requirements":"^1.12.1","direct":true,"kind":"runtime","optional":false},{"id":4115173934,"package_name":"transformers","ecosystem":"pypi","requirements":"^4.12.3","direct":true,"kind":"runtime","optional":false},{"id":4115173936,"package_name":"nltk","ecosystem":"pypi","requirements":"^3.6.5","direct":true,"kind":"runtime","optional":false},{"id":4115173940,"package_name":"scikit-learn","ecosystem":"pypi","requirements":"^1.0.1","direct":true,"kind":"runtime","optional":false},{"id":4115173943,"package_name":"fsspec","ecosystem":"pypi","requirements":"^2021.11.0","direct":true,"kind":"runtime","optional":false},{"id":4115173950,"package_name":"kenlm","ecosystem":"pypi","requirements":"*","direct":true,"kind":"runtime","optional":false},{"id":4115173953,"package_name":"typer","ecosystem":"pypi","requirements":"^0.4.0","direct":true,"kind":"runtime","optional":false},{"id":4115173955,"package_name":"regex","ecosystem":"pypi","requirements":"^2021.11.10","direct":true,"kind":"runtime","optional":false},{"id":4115173959,"package_name":"simhash-py","ecosystem":"pypi","requirements":"^0.4.0","direct":true,"kind":"runtime","optional":false},{"id":4115173962,"package_name":"PyYAML","ecosystem":"pypi","requirements":"^6.0","direct":true,"kind":"runtime","optional":false},{"id":4115173964,"package_name":"tqdm","ecosystem":"pypi","requirements":"^4.62.3","direct":true,"kind":"runtime","optional":false},{"id":4115173968,"package_name":"pdbpp","ecosystem":"pypi","requirements":"^0.10.2","direct":true,"kind":"develop","optional":false},{"id":4115173971,"package_name":"isort","ecosystem":"pypi","requirements":"^5.6.4","direct":true,"kind":"develop","optional":false},{"id":4115173973,"package_name":"flake8","ecosystem":"pypi","requirements":"^3.8.4","direct":true,"kind":"develop","optional":false},{"id":4115173976,"package_name":"black","ecosystem":"pypi","requirements":"^21.7b0","direct":true,"kind":"develop","optional":false},{"id":4115173979,"package_name":"pytest","ecosystem":"pypi","requirements":"^6.2.4","direct":true,"kind":"develop","optional":false},{"id":4115173982,"package_name":"jupyterlab","ecosystem":"pypi","requirements":"^3.0.16","direct":true,"kind":"develop","optional":false}]},{"ecosystem":"pypi","filepath":"requirements.txt","sha":null,"kind":"manifest","created_at":"2022-09-07T15:12:56.717Z","updated_at":"2022-09-07T15:12:56.717Z","repository_link":"https://github.com/bigscience-workshop/data_tooling/blob/master/requirements.txt","dependencies":[{"id":4115176672,"package_name":"dataset","ecosystem":"pypi","requirements":"\u003e=1.5.0","direct":true,"kind":"runtime","optional":false},{"id":4115176673,"package_name":"datasets","ecosystem":"pypi","requirements":"\u003e=1.8.0","direct":true,"kind":"runtime","optional":false},{"id":4115176674,"package_name":"fasttext","ecosystem":"pypi","requirements":"\u003e=0.9.2","direct":true,"kind":"runtime","optional":false},{"id":4115176675,"package_name":"fsspec","ecosystem":"pypi","requirements":"*","direct":true,"kind":"runtime","optional":false},{"id":4115176676,"package_name":"ftfy","ecosystem":"pypi","requirements":"*","direct":true,"kind":"runtime","optional":false},{"id":4115176677,"package_name":"indexed_gzip","ecosystem":"pypi","requirements":"\u003e=1.6.1","direct":true,"kind":"runtime","optional":false},{"id":4115176678,"package_name":"langid","ecosystem":"pypi","requirements":"\u003e=1.1.6","direct":true,"kind":"runtime","optional":false},{"id":4115176679,"package_name":"nltk","ecosystem":"pypi","requirements":"*","direct":true,"kind":"runtime","optional":false},{"id":4115176680,"package_name":"scikit-learn","ecosystem":"pypi","requirements":"*","direct":true,"kind":"runtime","optional":false},{"id":4115176681,"package_name":"sentencepiece","ecosystem":"pypi","requirements":"*","direct":true,"kind":"runtime","optional":false},{"id":4115176682,"package_name":"sqlalchemy","ecosystem":"pypi","requirements":"\u003e=1.4.20","direct":true,"kind":"runtime","optional":false},{"id":4115176683,"package_name":"transformers","ecosystem":"pypi","requirements":"*","direct":true,"kind":"runtime","optional":false},{"id":4115176684,"package_name":"wordfreq","ecosystem":"pypi","requirements":"*","direct":true,"kind":"runtime","optional":false}]},{"ecosystem":"pypi","filepath":"tokenizer/python_script/requirements.txt","sha":null,"kind":"manifest","created_at":"2022-09-07T15:12:56.804Z","updated_at":"2022-09-07T15:12:56.804Z","repository_link":"https://github.com/bigscience-workshop/data_tooling/blob/master/tokenizer/python_script/requirements.txt","dependencies":[{"id":4115176844,"package_name":"datasets","ecosystem":"pypi","requirements":"\u003e=1.18.0","direct":true,"kind":"runtime","optional":false},{"id":4115176845,"package_name":"pyarrow","ecosystem":"pypi","requirements":"\u003e=6.0.0","direct":true,"kind":"runtime","optional":false}]},{"ecosystem":"actions","filepath":".github/workflows/add-issue-to-project.yml","sha":null,"kind":"manifest","created_at":"2023-01-29T22:30:28.787Z","updated_at":"2023-01-29T22:30:28.787Z","repository_link":"https://github.com/bigscience-workshop/data_tooling/blob/master/.github/workflows/add-issue-to-project.yml","dependencies":[{"id":7233184765,"package_name":"tibdex/github-app-token","ecosystem":"actions","requirements":"36464acb844fc53b9b8b2401da68844f6b05ebb0","direct":true,"kind":"composite","optional":false}]},{"ecosystem":"actions","filepath":".github/workflows/pii-manager.yml","sha":null,"kind":"manifest","created_at":"2023-01-29T22:30:29.010Z","updated_at":"2023-01-29T22:30:29.010Z","repository_link":"https://github.com/bigscience-workshop/data_tooling/blob/master/.github/workflows/pii-manager.yml","dependencies":[{"id":7233184808,"package_name":"actions/setup-python","ecosystem":"actions","requirements":"v1","direct":true,"kind":"composite","optional":false},{"id":7233184809,"package_name":"actions/checkout","ecosystem":"actions","requirements":"v2","direct":true,"kind":"composite","optional":false}]},{"ecosystem":"docker","filepath":"index_search/docker-compose.yml","sha":null,"kind":"manifest","created_at":"2023-01-29T22:30:29.101Z","updated_at":"2023-01-29T22:30:29.101Z","repository_link":"https://github.com/bigscience-workshop/data_tooling/blob/master/index_search/docker-compose.yml","dependencies":[{"id":7233184931,"package_name":"docker.elastic.co/elasticsearch/elasticsearch","ecosystem":"docker","requirements":"7.13.2","direct":true,"kind":"runtime","optional":false},{"id":7233184932,"package_name":"docker.elastic.co/kibana/kibana","ecosystem":"docker","requirements":"7.13.2","direct":true,"kind":"runtime","optional":false}]}],"score":null,"created_at":"2025-09-09T00:23:01.064Z","updated_at":"2025-10-07T08:36:45.011Z","avatar_url":"https://github.com/bigscience-workshop.png","language":"HTML","category":null,"sub_category":null,"monthly_downloads":0,"funding_links":[],"readme_doi_urls":[],"works":{},"citation_counts":{},"total_citations":0,"keywords_from_contributors":[],"project_url":"https://science.ecosyste.ms/api/v1/projects/146789","html_url":"https://science.ecosyste.ms/projects/146789"}